legion-data 1.6.11 → 1.6.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/lib/legion/data/extract/handlers/vtt.rb +65 -0
- data/lib/legion/data/extract/type_detector.rb +2 -1
- data/lib/legion/data/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 17ce25b6562386687fec0e9647a28005e54f430a2186ffd841c345b8e5c98a55
|
|
4
|
+
data.tar.gz: 6227a1417584976e995d4868d4a8ef6d5084a9e7cad49963f7155cb9a7e99b46
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b066c7ff24ba343941e507616899e77eeb3804a6b83ee0b3954c5646e3ea097daf5402785ef27dc7bba01d5c8742fe7fb23b15d5134a5db058dec6999b5b4b26
|
|
7
|
+
data.tar.gz: f27b514fec0fb8c2d9d8157f23a69f0c7f81d0eade28ec8f824ef44132d48d40db0c5ff1e97da9e7e8a647fd97314be5d4b3fe3edad8187ebfe42a541d73a281
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
# Legion::Data Changelog
|
|
2
2
|
|
|
3
|
+
## [1.6.12] - 2026-03-28
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- VTT (WebVTT) extract handler for meeting transcript parsing (`Handlers::Vtt`)
|
|
7
|
+
- Parses speaker tags (`<v SpeakerName>`), timestamps, and WEBVTT header
|
|
8
|
+
- `preserve_speakers: true` (default) prefixes each line with speaker name
|
|
9
|
+
- Accepts inline VTT string content or a file path
|
|
10
|
+
- Returns `{ text:, metadata: { bytes:, speakers:, line_count: } }`
|
|
11
|
+
- `.vtt` extension registered in `TypeDetector::EXTENSION_MAP` (maps to `:vtt`)
|
|
12
|
+
|
|
3
13
|
## [1.6.11] - 2026-03-28
|
|
4
14
|
|
|
5
15
|
### Added
|
|
data/lib/legion/data/extract/handlers/vtt.rb
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
|
|
4
|
+
module Data
|
|
5
|
+
module Extract
|
|
6
|
+
module Handlers
|
|
7
|
+
class Vtt < Base
|
|
8
|
+
# Matches a WebVTT cue timing line ("start --> end", optionally followed by
# cue settings). The hours field is optional and may have more than two
# digits; the arrow may be surrounded by spaces or tabs.
TIMESTAMP_PATTERN = /^(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}[ \t]+-->[ \t]+(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}/
# Matches a line that opens with a voice tag, with or without class
# annotations (`<v Name>`, `<v.loud Name>`). Capture 1 is the speaker
# annotation, capture 2 is the remainder of the line.
SPEAKER_TAG_PATTERN = /^<v(?:\.[^\s>]+)?[ \t]+([^>]+)>(.*)$/
|
|
10
|
+
|
|
11
|
+
# Handler type identifier.
def self.type
  :vtt
end

# File extensions associated with this handler.
def self.extensions
  %w[.vtt]
end

# Always nil for this handler (presumably: no extra gem is needed — confirm
# against the Base handler contract).
def self.gem_name
  nil
end
|
|
14
|
+
|
|
15
|
+
# Extracts transcript text and metadata from WebVTT input.
#
# @param source [#read, String, #to_s] an object responding to `read`, a
#   String containing a newline (used as inline VTT content), or anything
#   else, which is converted with `to_s` and read as a file path
# @param preserve_speakers [Boolean] forwarded unchanged to `parse_vtt`
# @return [Hash] `{ text:, metadata: { bytes:, speakers:, line_count: } }` on
#   success, or `{ text: nil, error: }` when any StandardError is raised
def self.extract(source, preserve_speakers: true)
  raw = if source.respond_to?(:read)
          source.read
        else
          # A newline marks the string as VTT content rather than a path.
          inline = source.is_a?(String) && source.include?("\n")
          inline ? source : File.read(source.to_s)
        end

  rows = parse_vtt(raw, preserve_speakers: preserve_speakers)
  transcript = rows.join("\n")
  voices = extract_speakers(raw)

  { text: transcript, metadata: { bytes: raw.bytesize, speakers: voices, line_count: rows.size } }
rescue StandardError => e
  { text: nil, error: e.message }
end
|
|
37
|
+
|
|
38
|
+
# Converts raw WebVTT text into an array of transcript lines.
#
# Dropped from the output: blank lines, a leading UTF-8 byte order mark, the
# WEBVTT header (including trailing header text on the first line), cue
# timing lines, cue identifier lines and closing `</v>` tags.
#
# @param content [String] raw WebVTT text
# @param preserve_speakers [Boolean] when true, a line opening with a voice
#   tag becomes "Speaker: text"; when false only the text is kept
# @return [Array<String>] transcript lines in document order
def self.parse_vtt(content, preserve_speakers: true)
  bom = "\xEF\xBB\xBF".b
  # Compare raw bytes so the check does not depend on the content's encoding.
  has_bom = content.byteslice(0, bom.bytesize).b == bom
  body = has_bom ? content.byteslice(bom.bytesize..) : content
  rows = body.each_line.map(&:strip)

  rows.each_with_index.filter_map do |row, idx|
    next if row.empty? || row == 'WEBVTT' || TIMESTAMP_PATTERN.match?(row)
    # The signature line may carry text after the magic word ("WEBVTT - Title").
    next if idx.zero? && row.match?(/\AWEBVTT[ \t]/)

    # A cue identifier opens a cue block: it follows a blank line (or starts
    # the input) and is immediately followed by the timing line. Requiring
    # both keeps cue text intact in files that omit blank lines between cues.
    opens_block = idx.zero? || rows[idx - 1].empty?
    next if opens_block && TIMESTAMP_PATTERN.match?(rows[idx + 1].to_s)

    # "<v Name>text</v>" cues would otherwise leak the closing tag into the text.
    cleaned = row.gsub('</v>', '').strip
    match = SPEAKER_TAG_PATTERN.match(cleaned)
    if match
      spoken = match[2].strip
      preserve_speakers ? "#{match[1].strip}: #{spoken}" : spoken
    elsif !cleaned.empty?
      cleaned
    end
  end
end
|
|
56
|
+
|
|
57
|
+
# Lists the distinct speaker names found in voice tags that start a line,
# in order of first appearance.
#
# @param content [String] raw WebVTT text
# @return [Array<String>] whitespace-trimmed, de-duplicated speaker names
def self.extract_speakers(content)
  tagged = content.scan(SPEAKER_TAG_PATTERN)
  names = tagged.map { |name, _spoken| name.strip }
  names.uniq
end
|
|
60
|
+
private_class_method :parse_vtt, :extract_speakers
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
data/lib/legion/data/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: legion-data
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.6.11
|
|
4
|
+
version: 1.6.12
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Esity
|
|
@@ -123,6 +123,7 @@ files:
|
|
|
123
123
|
- lib/legion/data/extract/handlers/pdf.rb
|
|
124
124
|
- lib/legion/data/extract/handlers/pptx.rb
|
|
125
125
|
- lib/legion/data/extract/handlers/text.rb
|
|
126
|
+
- lib/legion/data/extract/handlers/vtt.rb
|
|
126
127
|
- lib/legion/data/extract/handlers/xlsx.rb
|
|
127
128
|
- lib/legion/data/extract/type_detector.rb
|
|
128
129
|
- lib/legion/data/helper.rb
|