legion-data 1.6.11 → 1.6.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c0e428fc5d4a92e8fe8b3c24f2277f5842e13b0c9814f712c82cc7cc41a43cfc
4
- data.tar.gz: b9d1dd28db840e02b4308d2b29b20a1ff89b7932c07952f243b64cad6b45d5c1
3
+ metadata.gz: 17ce25b6562386687fec0e9647a28005e54f430a2186ffd841c345b8e5c98a55
4
+ data.tar.gz: 6227a1417584976e995d4868d4a8ef6d5084a9e7cad49963f7155cb9a7e99b46
5
5
  SHA512:
6
- metadata.gz: df6941f0d373dbb1d18aadd1b75ee092ec58d8da8fb82b273e6b847f8cf420e999212d0323171bfdf965fe4b105e0e4f13bbe62d3f29ac77f63c35477a4f78c7
7
- data.tar.gz: d64f91ecafd301f33153e5bc77988014f11ceb2a3cc0eacf0201bcb19250adc7f71888004643d45ab967cf230ca82fb1986b166b389f1b08a4c098890fed98c1
6
+ metadata.gz: b066c7ff24ba343941e507616899e77eeb3804a6b83ee0b3954c5646e3ea097daf5402785ef27dc7bba01d5c8742fe7fb23b15d5134a5db058dec6999b5b4b26
7
+ data.tar.gz: f27b514fec0fb8c2d9d8157f23a69f0c7f81d0eade28ec8f824ef44132d48d40db0c5ff1e97da9e7e8a647fd97314be5d4b3fe3edad8187ebfe42a541d73a281
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  # Legion::Data Changelog
2
2
 
3
+ ## [1.6.12] - 2026-03-28
4
+
5
+ ### Added
6
+ - VTT (WebVTT) extract handler for meeting transcript parsing (`Handlers::Vtt`)
7
+ - Parses speaker tags (`<v SpeakerName>`), timestamps, and WEBVTT header
8
+ - `preserve_speakers: true` (default) prefixes each line with speaker name
9
+ - Accepts inline VTT string content or a file path
10
+ - Returns `{ text:, metadata: { bytes:, speakers:, line_count: } }`
11
+ - `.vtt` extension registered in `TypeDetector::EXTENSION_MAP` (maps to `:vtt`)
12
+
3
13
  ## [1.6.11] - 2026-03-28
4
14
 
5
15
  ### Added
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Legion
4
+ module Data
5
+ module Extract
6
+ module Handlers
7
# Extract handler that turns WebVTT (.vtt) transcripts into plain text.
#
# Parsing is line-based: blank lines, the WEBVTT header line and cue
# timing lines are dropped; every other line is kept as transcript text.
# A line that opens with a voice tag (`<v Speaker>`) is unwrapped and,
# optionally, prefixed with the speaker name.
#
# NOTE(review): cue identifiers and NOTE/STYLE/REGION blocks are not
# recognised here and are passed through as ordinary text lines.
class Vtt < Base
  # Cue timing line, e.g. "00:00:01.000 --> 00:00:04.000". The hours
  # component is optional (and may exceed two digits), and any run of
  # spaces/tabs may surround the arrow.
  TIMESTAMP_PATTERN = /^(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}[ \t]+-->[ \t]+(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}/

  # Voice tag at the start of a line. Capture 1 is the speaker name,
  # capture 2 is the remainder of the line. An optional class suffix on
  # the tag (`<v.loud Name>`) is accepted and ignored.
  SPEAKER_TAG_PATTERN = /^<v(?:\.[^\s>]+)?[ \t]+([^>]+)>(.*)$/

  # File signature line: "WEBVTT", optionally followed by a space or tab
  # and free-form header text.
  HEADER_PATTERN = /\AWEBVTT(?:[ \t].*)?\z/

  # Byte values of a UTF-8 byte order mark. Compared bytewise so the check
  # is safe for content in any encoding (including binary).
  UTF8_BOM_BYTES = [0xEF, 0xBB, 0xBF].freeze

  # Closing voice tag that may terminate a `<v Speaker>...</v>` span.
  VOICE_END_TAG = '</v>'

  def self.type = :vtt
  def self.extensions = %w[.vtt]
  def self.gem_name = nil

  # Extracts transcript text and metadata from a WebVTT source.
  #
  # @param source [#read, String, #to_s] an object responding to `read`,
  #   an inline VTT string (detected by the presence of a newline), or
  #   anything else, which is converted with `to_s` and read as a file path
  # @param preserve_speakers [Boolean] when true, voice-tagged lines are
  #   emitted as "Speaker: text"; when false only the text is emitted
  # @return [Hash] `{ text:, metadata: { bytes:, speakers:, line_count: } }`
  #   on success, or `{ text: nil, error: }` when any StandardError is
  #   raised while reading or parsing
  def self.extract(source, preserve_speakers: true)
    content = if source.respond_to?(:read)
                source.read
              elsif source.is_a?(String) && source.include?("\n")
                source
              else
                File.read(source.to_s)
              end
    lines = parse_vtt(content, preserve_speakers: preserve_speakers)
    {
      text: lines.join("\n"),
      metadata: {
        bytes: content.bytesize,
        speakers: extract_speakers(content),
        line_count: lines.size
      }
    }
  rescue StandardError => e
    { text: nil, error: e.message }
  end

  # Converts raw VTT content into an array of transcript lines.
  #
  # @param content [String] full VTT document
  # @param preserve_speakers [Boolean] see {.extract}
  # @return [Array<String>] transcript lines in document order
  def self.parse_vtt(content, preserve_speakers: true)
    at_file_start = true
    strip_bom(content).each_line.filter_map do |raw|
      line = raw.strip
      next if line.empty?

      # Header text after "WEBVTT" is only honoured on the first
      # non-blank line so cue text that merely starts with the word is
      # never discarded.
      header = at_file_start && HEADER_PATTERN.match?(line)
      at_file_start = false
      # A bare "WEBVTT" line is skipped wherever it appears, as before.
      next if header || line == 'WEBVTT'
      next if TIMESTAMP_PATTERN.match?(line)

      format_line(line, preserve_speakers)
    end
  end

  # Unwraps a voice-tagged line; other lines are returned unchanged.
  #
  # @param line [String] a stripped, non-empty line
  # @param preserve_speakers [Boolean] see {.extract}
  # @return [String]
  def self.format_line(line, preserve_speakers)
    match = SPEAKER_TAG_PATTERN.match(line)
    return line unless match

    # Drop a trailing end tag so "<v Ann>Hello</v>" yields "Hello".
    spoken = match[2].strip.delete_suffix(VOICE_END_TAG).rstrip
    preserve_speakers ? "#{match[1].strip}: #{spoken}" : spoken
  end

  # Collects the distinct speaker names in order of first appearance.
  # Lines are stripped before matching, exactly as in parse_vtt, so both
  # methods agree on which lines carry a voice tag.
  #
  # @param content [String] full VTT document
  # @return [Array<String>]
  def self.extract_speakers(content)
    content.each_line.filter_map { |raw| raw.strip[SPEAKER_TAG_PATTERN, 1]&.strip }.uniq
  end

  # Removes a leading UTF-8 byte order mark, if present.
  #
  # @param content [String]
  # @return [String] content without the BOM (the same object when absent)
  def self.strip_bom(content)
    return content unless content.byteslice(0, UTF8_BOM_BYTES.size).bytes == UTF8_BOM_BYTES

    content.byteslice(UTF8_BOM_BYTES.size, content.bytesize)
  end
  private_class_method :parse_vtt, :format_line, :extract_speakers, :strip_bom
end
62
+ end
63
+ end
64
+ end
65
+ end
@@ -17,7 +17,8 @@ module Legion
17
17
  '.json' => :json,
18
18
  '.jsonl' => :jsonl,
19
19
  '.html' => :html,
20
- '.htm' => :html
20
+ '.htm' => :html,
21
+ '.vtt' => :vtt
21
22
  }.freeze
22
23
 
23
24
  module_function
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Legion
4
4
  module Data
5
- VERSION = '1.6.11'
5
+ VERSION = '1.6.12'
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: legion-data
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.11
4
+ version: 1.6.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esity
@@ -123,6 +123,7 @@ files:
123
123
  - lib/legion/data/extract/handlers/pdf.rb
124
124
  - lib/legion/data/extract/handlers/pptx.rb
125
125
  - lib/legion/data/extract/handlers/text.rb
126
+ - lib/legion/data/extract/handlers/vtt.rb
126
127
  - lib/legion/data/extract/handlers/xlsx.rb
127
128
  - lib/legion/data/extract/type_detector.rb
128
129
  - lib/legion/data/helper.rb