tabula-rb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +39 -0
  4. data/CHANGELOG.md +59 -0
  5. data/LICENSE +21 -0
  6. data/README.md +176 -0
  7. data/Rakefile +28 -0
  8. data/exe/tabula +7 -0
  9. data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
  10. data/lib/tabula/algorithms/projection_profile.rb +109 -0
  11. data/lib/tabula/cli.rb +271 -0
  12. data/lib/tabula/configuration.rb +119 -0
  13. data/lib/tabula/core/point.rb +60 -0
  14. data/lib/tabula/core/rectangle.rb +218 -0
  15. data/lib/tabula/core/ruling.rb +303 -0
  16. data/lib/tabula/core/spatial_index.rb +120 -0
  17. data/lib/tabula/detectors/detection_algorithm.rb +34 -0
  18. data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
  19. data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
  20. data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
  21. data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
  22. data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
  23. data/lib/tabula/pdf/object_extractor.rb +400 -0
  24. data/lib/tabula/pdf/page.rb +230 -0
  25. data/lib/tabula/pdf/text_stripper.rb +150 -0
  26. data/lib/tabula/table/cell.rb +110 -0
  27. data/lib/tabula/table/table.rb +184 -0
  28. data/lib/tabula/text/line.rb +133 -0
  29. data/lib/tabula/text/text_chunk.rb +185 -0
  30. data/lib/tabula/text/text_element.rb +120 -0
  31. data/lib/tabula/version.rb +5 -0
  32. data/lib/tabula/writers/csv_writer.rb +49 -0
  33. data/lib/tabula/writers/json_writer.rb +41 -0
  34. data/lib/tabula/writers/markdown_writer.rb +71 -0
  35. data/lib/tabula/writers/tsv_writer.rb +35 -0
  36. data/lib/tabula/writers/writer.rb +39 -0
  37. data/lib/tabula.rb +160 -0
  38. data/mise.toml +2 -0
  39. data/tabula-rb.gemspec +44 -0
  40. metadata +115 -0
@@ -0,0 +1,185 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  # Represents a group of text elements (typically a word or phrase).
  # Extends Rectangle to provide bounding box functionality.
  class TextChunk < Rectangle
    # @return [Array<TextElement>] elements in the order they were added
    attr_reader :elements

    # @param element_or_rect [TextElement, Rectangle, nil] initial element or bounds;
    #   nil creates an element-less chunk with a 0,0,0,0 rectangle
    # @raise [ArgumentError] if given anything else
    def initialize(element_or_rect = nil)
      # TextElement is itself a Rectangle, so it has to be matched first.
      case element_or_rect
      when TextElement
        super(element_or_rect.top, element_or_rect.left, element_or_rect.width, element_or_rect.height)
        @elements = [element_or_rect]
      when Rectangle
        super(element_or_rect.top, element_or_rect.left, element_or_rect.width, element_or_rect.height)
        @elements = []
      when nil
        super(0, 0, 0, 0)
        @elements = []
      else
        raise ArgumentError, 'Expected TextElement, Rectangle, or nil'
      end
    end

    # Give copies made with #dup / #clone their own element list, so adding
    # to a copy never changes the chunk it was copied from.
    # @param source [TextChunk] the chunk being copied
    def initialize_copy(source)
      super
      @elements = source.elements.dup
    end

    # Add a text element to this chunk and merge its bounds into the chunk's.
    # NOTE(review): merge! is inherited from Rectangle (not visible here); if it
    # is a plain union, adding to a chunk built with no argument also pulls the
    # 0,0 origin into the bounds - prefer TextChunk.new(first_element).
    # @param element [TextElement] element to add
    # @return [TextChunk] self
    def add(element)
      @elements << element
      merge!(element)
      self
    end

    # Add multiple elements
    # @param elements [Array<TextElement>] elements to add
    # @return [TextChunk] self
    def add_all(elements)
      elements.each { |e| add(e) }
      self
    end

    # Get the combined text content, ordered by left edge (reversed when the
    # chunk is RTL dominant).
    # @param normalize [Boolean] whether to collapse whitespace runs and strip
    # @return [String] the text content
    def text(normalize: true)
      ordered = @elements.sort_by(&:left)
      ordered.reverse! unless ltr_dominant?
      raw = ordered.map(&:text).join
      normalize ? raw.gsub(/\s+/, ' ').strip : raw
    end

    # Check if this chunk is RTL dominant
    # @return [Boolean]
    def rtl_dominant?
      !ltr_dominant?
    end

    # Get width of space character for this chunk
    # @return [Float, nil] the first non-nil value reported by an element
    def width_of_space
      @elements.each do |element|
        space = element.width_of_space
        return space unless space.nil?
      end
      nil
    end

    # Get font name
    # @return [String, nil] font name of the first element, nil when empty
    def font_name
      @elements.first&.font_name
    end

    # Get font size
    # @return [Float, nil] font size of the first element, nil when empty
    def font_size
      @elements.first&.font_size
    end

    # Check if every element's text is one of the given characters
    # @param chars [Array<String>] characters to check for
    # @return [Boolean] false for a chunk without elements
    def same_char?(chars)
      return false if @elements.empty?

      @elements.all? { |e| chars.include?(e.text) }
    end

    # Collapse runs of a repeated character. A run of at least +min_run+
    # consecutive elements whose text equals +char+ is reduced to its first
    # element; shorter runs and all other elements are kept unchanged.
    # @param char [String] character to squeeze
    # @param min_run [Integer] minimum run length to squeeze
    # @return [TextChunk] new chunk with squeezed text (self when the chunk
    #   has fewer than +min_run+ elements, since no run can qualify)
    def squeeze(char, min_run: 3)
      return self if @elements.size < min_run

      squeezed = TextChunk.new(Rectangle.new(top, left, width, height))

      # Group consecutive +char+ elements; every other element is its own group.
      runs = @elements.chunk_while { |a, b| a.text == char && b.text == char }
      runs.each do |run|
        if run.first.text == char && run.size >= min_run
          squeezed.add(run.first)
        else
          squeezed.add_all(run)
        end
      end

      squeezed
    end

    # Check if LTR text is dominant in this chunk (ties count as LTR)
    # @return [Boolean]
    def ltr_dominant?
      ltr_count = @elements.count(&:ltr?)
      rtl_count = @elements.count(&:rtl?)
      ltr_count >= rtl_count
    end

    # Split this chunk at an index
    # @param index [Integer] element index to split at
    # @return [Array<TextChunk>] two chunks, before and after the split; an
    #   out-of-range index yields a copy of this chunk and an empty chunk
    def split_at(index)
      return [dup, TextChunk.new] if index >= @elements.size
      return [TextChunk.new, dup] if index <= 0

      # 0 < index < size here, so both slices hold at least one element.
      [chunk_of(@elements[0...index]), chunk_of(@elements[index..])]
    end

    # Merge with another chunk
    # @param other [TextChunk] chunk to merge
    # @return [TextChunk] self
    def merge_chunk(other)
      # Iterate a snapshot: when other is self, #add would otherwise keep
      # appending to the very array being walked.
      other.elements.dup.each { |e| add(e) }
      self
    end

    def to_s
      "TextChunk[#{text.inspect}](#{left}, #{top}, #{width}, #{height})"
    end

    def inspect
      to_s
    end

    # @return [Boolean] true when the chunk holds no elements
    def empty?
      @elements.empty?
    end

    # @return [Integer] number of elements in the chunk
    def size
      @elements.size
    end

    class << self
      # Check if every chunk consists solely of the given characters
      # @param chunks [Array<TextChunk>] chunks to check
      # @param chars [Array<String>] characters to check for
      # @return [Boolean]
      def all_same_char?(chunks, chars)
        chunks.all? { |c| c.same_char?(chars) }
      end

      # Group text chunks into lines. Chunks are visited in (top, left) order
      # and a new line is started whenever the current line does not
      # vertically overlap the next chunk.
      # @param chunks [Array<TextChunk>] chunks to group
      # @return [Array<Line>] lines of text
      def group_by_lines(chunks)
        return [] if chunks.empty?

        sorted = chunks.sort_by { |c| [c.top, c.left] }
        lines = []
        current_line = Line.new

        sorted.each do |chunk|
          unless current_line.empty? || current_line.vertically_overlaps?(chunk)
            lines << current_line
            current_line = Line.new
          end
          current_line.add_chunk(chunk)
        end

        lines << current_line unless current_line.empty?
        lines
      end
    end

    private

    # Build a chunk whose bounds start from its first element rather than from
    # an empty 0,0,0,0 rectangle.
    # @param members [Array<TextElement>] non-empty list of elements
    # @return [TextChunk]
    def chunk_of(members)
      TextChunk.new(members.first).add_all(members.drop(1))
    end
  end
end
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  # Represents a single text element (character or glyph) extracted from a PDF.
  # Contains position, dimensions, and font information.
  class TextElement < Rectangle
    # Text direction constants
    DIRECTION_LTR = 0
    DIRECTION_RTL = 1

    attr_reader :text, :font_name, :font_size, :width_of_space, :direction

    # @param top [Float] top coordinate
    # @param left [Float] left coordinate
    # @param width [Float] width
    # @param height [Float] height
    # @param text [String] the text content
    # @param font_name [String, nil] name of the font
    # @param font_size [Float, nil] font size in points (converted with to_f when given)
    # @param width_of_space [Float, nil] width of space character in this font
    #   (converted with to_f when given)
    # @param direction [Integer] text direction (DIRECTION_LTR or DIRECTION_RTL)
    def initialize(top:, left:, width:, height:, text:, font_name: nil, font_size: nil,
                   width_of_space: nil, direction: DIRECTION_LTR)
      super(top, left, width, height)
      @text = text
      @font_name = font_name
      @font_size = font_size&.to_f
      @width_of_space = width_of_space&.to_f
      @direction = direction
    end

    # @return [Boolean] true when the element is left-to-right
    def ltr?
      direction == DIRECTION_LTR
    end

    # @return [Boolean] true when the element is right-to-left
    def rtl?
      direction == DIRECTION_RTL
    end

    # Check if this element is whitespace (nil, empty or blank text)
    # @return [Boolean]
    def whitespace?
      text.nil? || text.strip.empty?
    end

    def to_s
      "TextElement[#{text.inspect}](#{left}, #{top}, #{width}, #{height})"
    end

    def inspect
      to_s
    end

    # Elements are equal when the superclass comparison holds and text, font
    # name and font size match. Direction and width_of_space are not compared.
    # @param other [Object]
    # @return [Boolean]
    def ==(other)
      return false unless other.is_a?(TextElement)

      super && text == other.text && font_name == other.font_name &&
        font_size == other.font_size
    end
    alias eql? ==

    # Hash over the same fields #== compares, so equal elements share a hash.
    # @return [Integer]
    def hash
      [super, text, font_name, font_size].hash
    end

    class << self
      # Merge text elements into text chunks (words). Whitespace elements are
      # dropped; the rest are visited in (top, left) order and appended to the
      # most recent chunk whenever should_merge? allows it.
      # @param elements [Array<TextElement>] text elements to merge
      # @param vertical_rulings [Array<Ruling>] vertical rulings that act as word separators
      # @return [Array<TextChunk>] merged text chunks
      def merge_words(elements, vertical_rulings: [])
        return [] if elements.empty?

        # Sort by top first, then by left (RTL sorting handled in text assembly)
        visible = elements.reject(&:whitespace?).sort_by { |e| [e.top, e.left] }

        visible.each_with_object([]) do |element, chunks|
          current = chunks.last
          if current && should_merge?(current, element, vertical_rulings)
            current.add(element)
          else
            chunks << TextChunk.new(element)
          end
        end
      end

      private

      # Decide whether +element+ belongs to +chunk+: they must overlap
      # vertically, have no vertical ruling between them, and be separated
      # horizontally by at most half of the larger space width (falling back
      # to the box width when no space width is known).
      def should_merge?(chunk, element, vertical_rulings)
        return false unless chunk.vertically_overlaps?(element)

        # Check if there's a vertical ruling between them
        return false if vertical_rulings.any? { |r| ruling_between?(chunk, element, r) }

        # Check horizontal gap. The sort key is (top, left), so an element on
        # the same visual line can arrive after a chunk that sits to its
        # right; measure the gap between the facing edges in either order
        # instead of assuming the element is on the right.
        leading, trailing = left_to_right(chunk, element)
        gap = trailing.left - leading.right
        max_gap = [chunk.width_of_space || chunk.width, element.width_of_space || element.width].compact.max
        max_gap ||= element.width

        gap <= max_gap * 0.5
      end

      # True when +ruling+ is vertical, lies strictly between the facing edges
      # of the two boxes and spans their full combined height.
      def ruling_between?(chunk, element, ruling)
        return false unless ruling.vertical?

        leading, trailing = left_to_right(chunk, element)
        ruling_x = ruling.x1
        ruling_x > leading.right && ruling_x < trailing.left &&
          ruling.top <= [chunk.top, element.top].min &&
          ruling.bottom >= [chunk.bottom, element.bottom].max
      end

      # Order two boxes by their left edge.
      # @return [Array] [box further left, box further right]
      def left_to_right(chunk, element)
        chunk.left <= element.left ? [chunk, element] : [element, chunk]
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  # Released gem version string.
  VERSION = '1.0.0'
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+
5
module Tabula
  module Writers
    # Emits tables as delimiter-separated rows through Ruby's CSV library.
    class CSVWriter < Writer
      # Render tables into a freshly allocated string.
      # @param tables [Array<Table>] tables to write
      # @param options [Hash] constructor options (see #initialize)
      # @return [String] CSV formatted output
      def self.to_string(tables, **options)
        require 'stringio'
        buffer = StringIO.new
        new(**options).write(tables, buffer)
        buffer.string
      end

      # @param separator [String] field separator (default: comma)
      # @param quote_char [String] quote character (default: double quote)
      # @param force_quotes [Boolean] always quote fields (default: false)
      # @param options [Hash] remaining options, handed to the base Writer
      def initialize(separator: ',', quote_char: '"', force_quotes: false, **options)
        super(**options)
        @separator = separator
        @quote_char = quote_char
        @force_quotes = force_quotes
      end

      # Stream every table to +io+; an empty line is written before each table
      # except the first.
      # @param tables [Array<Table>] tables to write
      # @param io [IO] output destination
      def write(tables, io)
        settings = csv_settings

        tables.each_with_index do |table, position|
          io.puts unless position.zero?

          emitter = CSV.new(io, **settings)
          table.to_a.each { |cells| emitter << cells }
        end
      end

      private

      # Options handed to CSV.new, derived from the constructor arguments.
      def csv_settings
        { col_sep: @separator, quote_char: @quote_char, force_quotes: @force_quotes }
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module Tabula
  module Writers
    # Emits all tables as a single JSON array.
    class JSONWriter < Writer
      # @param pretty [Boolean] pretty-print JSON (default: false)
      # @param include_metadata [Boolean] serialize each table with #to_h
      #   instead of only its rows (default: true)
      # @param options [Hash] remaining options, handed to the base Writer
      def initialize(pretty: false, include_metadata: true, **options)
        super(**options)
        @pretty = pretty
        @include_metadata = include_metadata
      end

      # Serialize the tables and write the document, followed by a newline,
      # to +io+.
      # @param tables [Array<Table>] tables to write
      # @param io [IO] output destination
      def write(tables, io)
        payload = tables.map { |table| table_to_hash(table) }
        document = @pretty ? JSON.pretty_generate(payload) : JSON.generate(payload)
        io.puts document
      end

      private

      # Per-table representation: Table#to_h when metadata is wanted,
      # otherwise a hash holding just the rows under :data.
      def table_to_hash(table)
        return table.to_h if @include_metadata

        { data: table.to_a }
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  module Writers
    # Emits tables as GitHub-flavored Markdown; the first row of each table
    # becomes the header row.
    class MarkdownWriter < Writer
      # Separator cell for each supported alignment; anything else uses '---'.
      ALIGNMENT_MARKERS = { left: ':---', center: ':---:', right: '---:' }.freeze

      # @param alignment [Symbol] column alignment (:left, :center, :right, or nil for default)
      # @param options [Hash] remaining options, handed to the base Writer
      def initialize(alignment: nil, **options)
        super(**options)
        @alignment = alignment
      end

      # Write every table to +io+. An empty line precedes each table except
      # the first; tables without rows or without columns produce no rows.
      # @param tables [Array<Table>] tables to write
      # @param io [IO] output destination
      def write(tables, io)
        tables.each_with_index do |table, position|
          io.puts unless position.zero?

          rows = table.to_a
          widest = rows.map(&:size).max
          next if widest.nil? || widest.zero?

          header, *body = rows
          write_row(io, header, widest)
          write_separator(io, widest)
          body.each { |row| write_row(io, row, widest) }
        end
      end

      private

      # Print one table row, padding short rows with empty cells up to col_count.
      def write_row(io, row, col_count)
        cells = Array.new(col_count) { |column| escape_markdown(row[column].to_s) }
        io.puts "| #{cells.join(' | ')} |"
      end

      # Print the header/body divider using the configured alignment marker.
      def write_separator(io, col_count)
        marker = ALIGNMENT_MARKERS.fetch(@alignment, '---')
        separators = Array.new(col_count, marker)
        io.puts "| #{separators.join(' | ')} |"
      end

      # Escape pipe characters, collapse whitespace runs to one space and trim.
      def escape_markdown(text)
        piped = text.gsub('|', '\\|')
        piped.gsub(/\s+/, ' ').strip
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  module Writers
    # Emits tables as TSV (Tab-Separated Values).
    class TSVWriter < Writer
      # Control characters that would break the row/column layout, mapped to
      # the two-character text written in their place (applied in this order).
      ESCAPES = { "\t" => '\\t', "\n" => '\\n', "\r" => '\\r' }.freeze

      # Write every table to +io+, one line per row; an empty line precedes
      # each table except the first.
      # @param tables [Array<Table>] tables to write
      # @param io [IO] output destination
      def write(tables, io)
        tables.each_with_index do |table, position|
          io.puts unless position.zero?

          table.to_a.each do |row|
            io.puts row.map { |cell| escape_value(cell) }.join("\t")
          end
        end
      end

      private

      # Stringify a cell (nil becomes '') and replace tabs, newlines and
      # carriage returns with their backslash forms.
      def escape_value(value)
        return '' if value.nil?

        ESCAPES.reduce(value.to_s) { |text, (raw, escaped)| text.gsub(raw, escaped) }
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  module Writers
    # Common parent of the table writers. Subclasses provide #write; the
    # class-level helpers build an instance and delegate to it.
    class Writer
      class << self
        # Instantiate the writer with +options+ and write the tables to +io+.
        # @param tables [Array<Table>] tables to write
        # @param io [IO] output destination (defaults to $stdout)
        # @param options [Hash] writer-specific options
        def write(tables, io = $stdout, **options)
          new(**options).write(tables, io)
        end

        # Render the tables into an in-memory buffer and return its contents.
        # @param tables [Array<Table>] tables to write
        # @param options [Hash] writer-specific options
        # @return [String] formatted output
        def to_string(tables, **options)
          require 'stringio'
          buffer = StringIO.new
          write(tables, buffer, **options)
          buffer.string
        end
      end

      # @param options [Hash] writer options, kept in @options
      def initialize(**options)
        @options = options
      end

      # Write tables to an IO object. Abstract in this class.
      # @param tables [Array<Table>] tables to write
      # @param io [IO] output destination
      # @raise [NotImplementedError] always, unless overridden by a subclass
      def write(tables, io)
        raise NotImplementedError, 'Subclasses must implement #write'
      end
    end
  end
end