tabula-extractor 0.0.1-java

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ coverage
6
+ InstalledFiles
7
+ lib/bundler/man
8
+ pkg
9
+ rdoc
10
+ spec/reports
11
+ test/tmp
12
+ test/version_tmp
13
+ tmp
14
+
15
+ # YARD artifacts
16
+ .yardoc
17
+ _yardoc
18
+ doc/
data/AUTHORS.md ADDED
@@ -0,0 +1,15 @@
1
+ Tabula was originally started by Manuel Aristarán in late 2012
2
+
3
+ The PRIMARY AUTHORS are (and/or have been):
4
+
5
+ * Manuel Aristarán - La Nación (Buenos Aires, Argentina), Knight-Mozilla OpenNews
6
+ * Mike Tigas - ProPublica, Knight-Mozilla OpenNews
7
+ * Jeremy Merrill - ProPublica
8
+ * David Frackman
9
+ * Travis Swicegood - Texas Tribune
10
+
11
+ Special thanks to these organizations:
12
+
13
+ * Knight-Mozilla OpenNews <http://www.mozillaopennews.org/>
14
+ * ProPublica <http://propublica.org>
15
+ * La Nación <http://www.lanacion.com.ar>
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Fetch gems over TLS; plain HTTP sources can be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in tabula-extractor.gemspec
gemspec

gem "rake"
data/LICENSE.md ADDED
@@ -0,0 +1,7 @@
1
+ Copyright (C) 2012-2013 Manuel Aristarán <jazzido@jazzido.com>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/NOTICE.txt ADDED
@@ -0,0 +1,6 @@
1
+ Tabula
2
+ © 2012-2013 Manuel Aristarán. Available under MIT License. See `AUTHORS.md`
3
+ and `LICENSE.md`.
4
+
5
+ This product includes software (target/pdfbox-app-1.8.0.jar) developed at
6
+ The Apache Software Foundation (http://www.apache.org/).
data/README.md ADDED
@@ -0,0 +1,24 @@
1
+ tabula-extractor
2
+ ================
3
+
4
+ Extract tables from PDF files
5
+
6
+ ## Usage
7
+
8
+ ```
9
+ $ tabula --help
10
+ Tabula helps you extract tables from PDFs
11
+
12
+ Usage:
13
+ tabula [options] <pdf_file>
14
+ where [options] are:
15
+ --page, -p <i>: Page number (default: 1)
16
+ --area, -a <s>: Portion of the page to analyze (top, left, bottom,
17
+ right). Example: --area 269.875, 12.75, 790.5, 561.
18
+ Default is entire page
19
+ --format, -f <s>: Output format (CSV,TSV,HTML,JSON) (default: CSV)
20
+ --outfile, -o <s>: Write output to <file> instead of STDOUT (default: -)
21
+ --version, -v: Print version and exit
22
+ --help, -h: Show this message
23
+ ```
24
+
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env rake
2
+ require 'bundler'
3
+ require 'rake'
4
+ require 'rake/testtask'
5
+
6
+ Bundler::GemHelper.install_tasks
7
+
8
# Define the `test` task when the Rakefile is loaded. Previously the
# Rake::TestTask was only created from inside the body of a hand-written
# `test` task, i.e. while that very task was already running.
Rake::TestTask.new(:test) do |t|
  t.test_files = Dir.glob('test/*.rb')
  t.verbose = true
end
data/bin/tabula ADDED
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env jruby
2
+ # encoding: utf-8
3
+ require 'trollop'
4
+ require_relative '../lib/tabula'
5
+
6
+ FORMATS = ['CSV', 'TSV', 'HTML', 'JSON']
7
+
8
# Parses and validates the command line.
#
# Exits through Trollop::die when the area is malformed, the format is not
# one of FORMATS, no file name was given, or the file does not exist.
#
# Returns [opts, pdf_filename], where opts is the Trollop options hash.
def parse_command_line
  opts = Trollop::options do
    version "tabula #{Tabula::VERSION} (c) 2012-2013 Manuel Aristarán"
    banner <<-EOS
Tabula helps you extract tables from PDFs

Usage:
       tabula [options] <pdf_file>
where [options] are:
EOS

    opt :page, 'Page number', :default => 1, :type => Integer
    opt :area, 'Portion of the page to analyze (top, left, bottom, right). Example: --area 269.875, 12.75, 790.5, 561. Default is entire page', :type => String, :default => nil
    opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
    opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
  end

  unless opts[:area].nil?
    coordinates = opts[:area].split(',')
    # Anchor the pattern: an unanchored match accepted any component that
    # merely contained a digit (e.g. "abc1").
    well_formed = coordinates.size == 4 &&
                  coordinates.all? { |x| x.strip =~ /\A\d+\.?\d*\z/ }
    Trollop::die :area, "is invalid" unless well_formed
  end
  Trollop::die :format, "is unknown" unless FORMATS.include?(opts[:format])
  Trollop::die "need one filename" if ARGV.empty?

  pdf_filename = ARGV.shift
  # File.exists? is deprecated and was removed in Ruby 3.2.
  Trollop::die 'file does not exist' unless File.exist? pdf_filename

  return opts, pdf_filename
end
40
+
41
# Entry point: extracts the table from the requested page (optionally
# restricted to --area) and writes it in the requested format to --outfile,
# or to STDOUT when the outfile is '-'.
def main
  opts, filename = parse_command_line

  area = opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)

  extractor = Tabula::Extraction::CharacterExtractor.new(filename, [opts[:page]])
  table = Tabula.make_table(extractor.extract.next.get_text(area))

  # The format was validated against FORMATS, so it names a Writers method.
  writer = opts[:format].to_sym

  if opts[:outfile] == '-'
    # Do not close $stdout: it is not ours to close.
    Tabula::Writers.send(writer, table, $stdout)
  else
    # The block form closes the file even when the writer raises.
    File.open(opts[:outfile], 'w') do |out|
      Tabula::Writers.send(writer, table, out)
    end
  end
end
51
+
52
+ main
53
+
data/lib/tabula.rb ADDED
@@ -0,0 +1,5 @@
1
+ require_relative './tabula/version'
2
+ require_relative './tabula/entities'
3
+ require_relative './tabula/pdf_dump'
4
+ require_relative './tabula/table_extractor'
5
+ require_relative './tabula/writers'
@@ -0,0 +1,259 @@
1
# Geometric primitives shared by the extraction and table-detection code:
# generic zones, pages, text elements, lines, columns and rulings.
module Tabula

  # An axis-aligned rectangle stored as its top-left corner plus a size.
  # #bottom is top + height and #right is left + width.
  class ZoneEntity
    attr_accessor :top, :left, :width, :height

    # Payload attached to the zone; starts out as an empty array.
    attr_accessor :texts

    def initialize(top, left, width, height)
      self.top = top
      self.left = left
      self.width = width
      self.height = height
      self.texts = []
    end

    def bottom
      top + height
    end

    def right
      left + width
    end

    # Center of the zone as [x, y]. Always computed with float division so
    # integer coordinates do not truncate.
    def midpoint
      [left + (width / 2.0), top + (height / 2.0)]
    end

    def area
      width * height
    end

    # Grows this zone in place to the bounding box of itself and +other+.
    def merge!(other)
      # #right and #bottom are derived from left/top, so the far edges must
      # be captured before left/top are reassigned; otherwise the merged box
      # comes out too small whenever +other+ starts further up or left.
      far_right  = [right, other.right].max
      far_bottom = [bottom, other.bottom].max

      self.top = [top, other.top].min
      self.left = [left, other.left].min
      self.width = far_right - left
      self.height = far_bottom - top
    end

    # Absolute gap between this zone's right edge and +other+'s left edge.
    def horizontal_distance(other)
      (other.left - right).abs
    end

    # Absolute difference between the two bottom edges.
    def vertical_distance(other)
      (other.bottom - bottom).abs
    end

    # Roughly, detects if self and other belong to the same line
    def vertically_overlaps?(other)
      [bottom, other.bottom].min - [top, other.top].max > 0
    end

    # detects if self and other belong to the same column
    def horizontally_overlaps?(other)
      [right, other.right].min - [left, other.left].max > 0
    end

    # True when the overlap ratio with +other+ exceeds +ratio_tolerance+.
    def overlaps?(other, ratio_tolerance=0.00001)
      overlap_ratio(other) > ratio_tolerance
    end

    # Intersection area divided by union area, as a Float.
    # Returns 0.0 when the union is empty (two zero-area zones) instead of
    # dividing by zero.
    def overlap_ratio(other)
      intersection_width = [0, [right, other.right].min - [left, other.left].max].max
      intersection_height = [0, [bottom, other.bottom].min - [top, other.top].max].max
      intersection_area = intersection_width * intersection_height

      union_area = area + other.area - intersection_area
      return 0.0 if union_area.zero?

      # to_f: with Integer coordinates plain `/` would truncate to 0.
      intersection_area.to_f / union_area
    end

    # Hash with the :top, :left, :width and :height of the zone.
    def to_h
      hash = {}
      [:top, :left, :width, :height].each do |m|
        hash[m] = send(m)
      end
      hash
    end

    # Serializes #to_h, forwarding the generator options/state.
    def to_json(options={})
      to_h.to_json(options)
    end
  end

  # A PDF page: a zone anchored at (0, 0) that also carries its rotation,
  # its page number and the text elements found on it.
  class Page < ZoneEntity
    attr_reader :rotation, :number

    def initialize(width, height, rotation, number, texts=[])
      super(0, 0, width, height)
      @rotation = rotation
      @number = number
      self.texts = texts
    end

    # get text, optionally from a provided area in the page [top, left, bottom, right]
    def get_text(area=nil)
      # The area is [top, left, bottom, right], so the whole page is
      # [0, 0, height, width] (bottom = height, right = width).
      area = [0, 0, height, width] if area.nil?
      ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
      texts.select { |t| t.overlaps? ze }
    end

    def to_json(options={})
      { :width => width,
        :height => height,
        :number => number,
        :rotation => rotation,
        :texts => texts
      }.to_json(options)
    end

  end

  # A run of text (initially a single character) with its bounding box.
  class TextElement < ZoneEntity
    attr_accessor :font, :font_size, :text

    CHARACTER_DISTANCE_THRESHOLD = 1.5

    def initialize(top, left, width, height, font, font_size, text)
      super(top, left, width, height)
      self.font = font
      self.font_size = font_size
      self.text = text
    end

    # more or less returns True if distance < tolerance
    #
    # Raises TypeError unless +other+ is exactly a TextElement.
    def should_merge?(other)
      raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)

      joinable_with?(other) && horizontal_distance(other) < spacing_tolerance(other)
    end

    # more or less returns True if (tolerance <= distance < CHARACTER_DISTANCE_THRESHOLD*tolerance)
    #
    # Raises TypeError unless +other+ is exactly a TextElement.
    def should_add_space?(other)
      raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)

      tolerance = spacing_tolerance(other)
      dist = horizontal_distance(other)
      joinable_with?(other) &&
        tolerance <= dist && dist < tolerance * CHARACTER_DISTANCE_THRESHOLD
    end

    # Absorbs +other+: concatenates the texts and grows the bounding box.
    # When +other+ sits above this element in the same horizontal span its
    # text is prepended, otherwise it is appended.
    #
    # Raises TypeError unless +other+ is exactly a TextElement.
    def merge!(other)
      raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)

      # Build a new string rather than appending in place, so a text string
      # shared with other objects (or a frozen one) is never mutated.
      self.text = if horizontally_overlaps?(other) && other.top < top
                    other.text + self.text
                  else
                    self.text + other.text
                  end
      super(other)
    end

    def to_h
      hash = super
      [:font, :text].each do |m|
        hash[m] = send(m)
      end
      hash
    end

    private

    # True when both elements share some vertical extent, or when exactly
    # one of the two has zero height.
    def joinable_with?(other)
      vertically_overlaps?(other) ||
        (height == 0 && other.height != 0) ||
        (other.height == 0 && height != 0)
    end

    # A quarter of the mean font size of both elements (float division).
    def spacing_tolerance(other)
      ((font_size + other.font_size) / 2.0) * 0.25
    end
  end


  # A table row under construction. Its bounds are unset until the first
  # element is added.
  class Line < ZoneEntity
    attr_accessor :text_elements

    def initialize
      self.text_elements = []
    end

    # Adds +t+ to the line. The first element defines the line's bounds; an
    # element that horizontally overlaps an existing one is merged into it;
    # anything else is appended and the line's bounds are grown.
    # NOTE(review): the merge-into-existing branch does not grow the line's
    # own bounds - confirm whether that is intended.
    def <<(t)
      if text_elements.empty?
        text_elements << t
        self.top = t.top
        self.left = t.left
        self.width = t.width
        self.height = t.height
      elsif (in_same_column = text_elements.find { |te| te.horizontally_overlaps?(t) })
        in_same_column.merge!(t)
      else
        text_elements << t
        merge!(t)
      end
    end


  end

  # A vertical band of text elements, kept sorted by their top edge.
  class Column < ZoneEntity
    attr_accessor :text_elements

    def initialize(left, width, text_elements=[])
      super(0, left, width, 0)
      @text_elements = text_elements
    end

    # Adds +te+, grows the column bounds and re-sorts the elements by top.
    def <<(te)
      text_elements << te
      update_boundaries!(te)
      text_elements.sort_by! { |t| t.top }
    end

    def update_boundaries!(text_element)
      merge!(text_element)
    end

    # this column can be merged with other_column?
    def contains?(other_column)
      horizontally_overlaps?(other_column)
    end

    # Average distance between the tops of consecutive elements, as a Float.
    # This might help to merge lines that shouldn't be split,
    # e.g. cells with more than one line of text.
    # Returns 0.0 when there are fewer than two elements.
    def average_line_distance
      gaps = text_elements.each_cons(2).map { |above, below| below.top - above.top }
      return 0.0 if gaps.empty?

      # Divide by the number of gaps (n - 1), not by the number of elements.
      gaps.inject(:+).to_f / gaps.size
    end

    def inspect
      vars = (instance_variables - [:@text_elements]).map { |v| "#{v}=#{instance_variable_get(v).inspect}" }
      texts = text_elements.sort_by { |te| te.top }.map { |te| te.text }
      "<#{self.class}: #{vars.join(', ')}, @text_elements=#{texts.join(', ')}>"
    end

  end

  # A ruling (drawn line) on the page, with its color.
  class Ruling < ZoneEntity
    attr_accessor :color

    def initialize(top, left, width, height, color)
      super(top, left, width, height)
      self.color = color
    end

    def to_h
      hash = super
      hash[:color] = color
      hash
    end

  end

end
@@ -0,0 +1,118 @@
1
+ require 'observer'
2
+
3
+ require_relative './entities.rb'
4
+
5
+ require 'java'
6
+ require File.join(File.dirname(__FILE__), '../../target/pdfbox-app-1.8.0.jar')
7
+ java_import org.apache.pdfbox.pdfparser.PDFParser
8
+ java_import org.apache.pdfbox.pdmodel.PDDocument
9
+ java_import org.apache.pdfbox.util.PDFTextStripper
10
+
11
+ module Tabula
12
+ module Extraction
13
# PDFTextStripper subclass that collects the text positions PDFBox reports
# through #processTextPosition, keeping only printable characters.
class TextExtractor < org.apache.pdfbox.util.PDFTextStripper

  PRINTABLE_RE = /[[:print:]]/

  attr_accessor :characters, :fonts

  def initialize
    super
    clear!
    setSortByPosition(true)
  end

  # Forgets everything collected so far.
  def clear!
    self.characters = []
    self.fonts = {}
  end

  # Stores +text+ when its character is printable; everything else is
  # dropped.
  def processTextPosition(text)
    # probably not the fastest way of detecting printable chars
    return unless text.getCharacter =~ PRINTABLE_RE

    characters << text
  end
end
49
+
50
# Enumerates the pages of a PDF as Tabula::Page objects carrying only
# geometry (crop box size, rotation, page number) and no text.
class PagesInfoExtractor
  # Raises Errno::ENOENT when +pdf_filename+ does not exist.
  def initialize(pdf_filename)
    # File.exists? is deprecated and was removed in Ruby 3.2; also name the
    # missing file in the error.
    raise Errno::ENOENT, pdf_filename unless File.exist?(pdf_filename)
    @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
    @all_pages = @pdf_file.getDocumentCatalog.getAllPages
  end

  # Returns an Enumerator yielding one Tabula::Page per page that has
  # contents; page numbers are 1-based. The document is closed once the
  # enumeration finishes.
  def pages
    Enumerator.new do |y|
      begin
        @all_pages.each_with_index do |page, i|
          contents = page.getContents
          next if contents.nil?
          crop_box = page.findCropBox
          y.yield Tabula::Page.new(crop_box.width,
                                   crop_box.height,
                                   page.getRotation.to_i,
                                   i + 1)
        end
      ensure
        @pdf_file.close
      end
    end
  end
end
74
+
75
+
76
# Extracts the characters of selected pages of a PDF as Tabula::Page
# objects whose texts are Tabula::TextElement instances.
class CharacterExtractor
  include Observable

  # +pages+ is a list of 1-based page numbers.
  # Raises Errno::ENOENT when +pdf_filename+ does not exist.
  def initialize(pdf_filename, pages=[1])
    # File.exists? is deprecated and was removed in Ruby 3.2; also name the
    # missing file in the error.
    raise Errno::ENOENT, pdf_filename unless File.exist?(pdf_filename)
    @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
    @all_pages = @pdf_file.getDocumentCatalog.getAllPages
    @pages = pages
    @extractor = TextExtractor.new
  end

  # Returns an Enumerator yielding one Tabula::Page per requested page that
  # has contents. The document is closed once the enumeration finishes.
  def extract
    Enumerator.new do |y|
      begin
        @pages.each do |i|
          page = @all_pages.get(i - 1)
          contents = page.getContents
          next if contents.nil?
          @extractor.clear!
          @extractor.processStream(page, page.findResources, contents.getStream)

          text_elements = @extractor.characters.map { |char|
            Tabula::TextElement.new(char.getYDirAdj,
                                    char.getXDirAdj,
                                    char.getWidthDirAdj,
                                    char.getHeightDir,
                                    nil,
                                    char.getFontSize,
                                    char.getCharacter)
          }

          crop_box = page.findCropBox
          # +i+ is already 1-based (see the get(i - 1) above), so it is the
          # page number as-is; adding 1 reported page 1 as page 2.
          y.yield Tabula::Page.new(crop_box.width,
                                   crop_box.height,
                                   page.getRotation.to_i,
                                   i,
                                   text_elements)
        end
      ensure
        @pdf_file.close
      end # begin
    end
  end
end
117
+ end
118
+ end
@@ -0,0 +1,237 @@
1
+ require 'csv'
2
+
3
+ module Tabula
4
# Groups text elements into rows and columns, optionally merging single
# characters into words first.
class TableExtractor
  attr_accessor :text_elements, :options

  DEFAULT_OPTIONS = {
    :horizontal_rulings => [],
    :vertical_rulings => [],
    :merge_words => true,
    :split_multiline_cells => false
  }

  # +text_elements+ is kept by reference; with :merge_words enabled (the
  # default) it is merged into words in place.
  def initialize(text_elements, options = {})
    self.text_elements = text_elements
    self.options = DEFAULT_OPTIONS.merge(options)
    @merged = false
    merge_words! if self.options[:merge_words]
  end

  # Row boundaries sorted by top, as hashes with 'top', 'bottom' and 'text'.
  def get_rows
    get_line_boundaries.sort_by(&:top).map { |r|
      {'top' => r.top, 'bottom' => r.bottom, 'text' => r.texts}
    }
  end

  # TODO finish writing this method
  # it should be analogous to get_line_boundaries
  # (ie, take into account vertical ruling lines if available)
  #
  # Returns an array of Column objects built by scanning the elements from
  # left to right and adding each one to the first column it horizontally
  # overlaps.
  def group_by_columns
    columns = []

    # we don't have vertical rulings
    text_elements.sort_by(&:left).each do |te|
      if (column = columns.detect { |c| te.horizontally_overlaps?(c) })
        column << te
      else
        columns << Column.new(te.left, te.width, [te])
      end
    end
    columns
  end

  # Column boundaries as hashes with 'left', 'right' and 'width'.
  def get_columns
    # Group this extractor's own elements directly instead of going through
    # the deprecated Tabula.group_by_columns wrapper (which builds a second
    # extractor over the same elements).
    group_by_columns.map { |c|
      {'left' => c.left, 'right' => c.right, 'width' => c.width}
    }
  end

  # Returns an array of ZoneEntity row boundaries. Without horizontal
  # rulings, rows are grown from vertically overlapping elements and carry
  # their texts; with rulings, each row is the zone between two consecutive
  # rulings.
  def get_line_boundaries
    boundaries = []

    if options[:horizontal_rulings].empty?
      # we don't have rulings
      # iteratively grow boundaries to construct lines
      text_elements.each do |te|
        row = boundaries.detect { |l| l.vertically_overlaps?(te) }
        ze = ZoneEntity.new(te.top, te.left, te.width, te.height)
        if row.nil?
          boundaries << ze
          ze.texts << te.text
        else
          row.merge!(ze)
          row.texts << te.text
        end
      end
    else
      # Sort a copy: the rulings array belongs to the caller.
      rulings = options[:horizontal_rulings].sort_by(&:top)
      rulings.each_cons(2) do |above, below|
        # construct zone between a horizontal ruling and the next
        ze = ZoneEntity.new(above.top,
                            [above.left, below.left].min,
                            [above.width, below.width].max,
                            below.top - above.top)

        # skip areas shorter than some threshold
        # TODO: this should be the height of the shortest character, or something like that
        next if ze.height < 2

        boundaries << ze
      end
    end
    boundaries
  end

  private

  # Merges consecutive characters into words, in place, and returns the
  # merged array. Runs at most once per extractor.
  def merge_words!
    return text_elements if @merged # only merge once. awful hack.
    @merged = true

    # Drop nil entries up front. The previous in-loop `next` guard skipped
    # the index increment and would have spun forever on a nil element.
    text_elements.compact!

    current_word_index = 0
    previous_char = text_elements[0]

    0.upto(text_elements.size - 2) do |i|
      next_char = text_elements[i + 1]
      current_word = text_elements[current_word_index]

      if current_word.should_merge?(next_char)
        current_word.merge!(next_char)
        previous_char = next_char
        text_elements[i + 1] = nil # absorbed; removed by compact! below
      else
        # is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
        # NOTE(review): previous_char is only advanced on a merge, so after a
        # word break it still points at the last merged character - confirm.
        if previous_char.text != " " && next_char.text != " " && current_word.should_add_space?(next_char)
          current_word.text += " "
        end
        current_word_index = i + 1
      end
    end

    # compact! returns nil when nothing was removed, so return the array.
    text_elements.compact!
    text_elements
  end
end
119
+
120
# TODO the next four module methods are deprecated; each one builds a
# throwaway TableExtractor and delegates to the instance method of the
# same name.

# Columns of +text_elements+; words are not merged unless asked for.
def self.group_by_columns(text_elements, merge_words=false)
  extractor = TableExtractor.new(text_elements, :merge_words => merge_words)
  extractor.group_by_columns
end

# Row boundaries of +text_elements+, using the extractor defaults.
def self.get_line_boundaries(text_elements)
  extractor = TableExtractor.new(text_elements)
  extractor.get_line_boundaries
end

# Column boundaries of +text_elements+ as hashes.
def self.get_columns(text_elements, merge_words=true)
  extractor = TableExtractor.new(text_elements, :merge_words => merge_words)
  extractor.get_columns
end

# Row boundaries of +text_elements+ as hashes.
def self.get_rows(text_elements, merge_words=true)
  extractor = TableExtractor.new(text_elements, :merge_words => merge_words)
  extractor.get_rows
end
136
+
137
# Renders +lines+ (an array of rows, each an array of objects responding
# to #text) as a CSV string; every cell text is stripped of surrounding
# whitespace.
def self.lines_to_csv(lines)
  CSV.generate do |csv|
    lines.each do |row|
      csv << row.map { |cell| cell.text.strip }
    end
  end
end
144
+
145
+ ONLY_SPACES_RE = Regexp.new('^\s+$')
146
+
147
+ # Returns an array of rows; each row is the array of text elements of one Tabula::Line, sorted by left position
148
# Builds a table out of text elements.
#
# text_elements - array of text elements (typically single characters).
# options       - passed straight to TableExtractor.new.
#
# Steps: detect row boundaries, assign each element to a row, detect
# columns, pad rows that have fewer cells than there are columns, merge
# cells that fall in the same column, then drop duplicated rows.
#
# The result is an array of rows, each an array of text elements sorted
# by left position.
+ def Tabula.make_table(text_elements, options={})
149
# The extractor keeps a reference to text_elements; when word merging is
# enabled it merges that same array in place, so the code below sees the
# merged elements.
+ extractor = TableExtractor.new(text_elements, options)
150
+
151
+ # group by lines
152
+ lines = []
153
+ line_boundaries = extractor.get_line_boundaries
154
+
155
+ # find all the text elements
156
+ # contained within each detected line (table row) boundary
157
+ line_boundaries.each { |lb|
158
+ line = Line.new
159
+
160
+ line_members = text_elements.find_all { |te|
161
+ te.vertically_overlaps?(lb)
162
+ }
163
+
# Rebinds the local to a new array without the claimed elements, so each
# element ends up in the first boundary it overlaps.
164
+ text_elements -= line_members
165
+
166
+ line_members.sort_by(&:left).each { |te|
167
+ # skip text_elements that only contain spaces
168
+ next if te.text =~ ONLY_SPACES_RE
169
+ line << te
170
+ }
171
+
172
+ lines << line if line.text_elements.size > 0
173
+ }
174
+
175
+ lines.sort_by!(&:top)
176
+
177
+ columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)
178
+
179
+ # # insert empty cells if needed
180
+ lines.each_with_index { |l, line_index|
181
+ next if l.text_elements.nil?
182
+ l.text_elements.compact! # TODO WHY do I have to do this?
183
+ l.text_elements.uniq! # TODO WHY do I have to do this?
184
+ l.text_elements.sort_by!(&:left)
185
+
186
+ # l.text_elements = Tabula.merge_words(l.text_elements)
187
+
188
+ next unless l.text_elements.size < columns.size
189
+
190
+ columns.each_with_index do |c, i|
# `or` and `and` share the same precedence and associate to the left, so
# this reads ((past the end) or (cell i present)) and (cell i not in c).
# NOTE(review): the (&:left) block handed to the text_elements reader
# looks like a leftover and appears to have no effect - confirm.
191
+ if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
192
+ l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, ''))
193
+ end
194
+ end
195
+ }
196
+
197
+ # # merge elements that are in the same column
# Columns are recomputed because the padding step above may have inserted
# empty cells.
198
+ columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
199
+
200
+ lines.each_with_index do |l, line_index|
201
+ next if l.text_elements.nil?
202
+
203
+ (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|
204
+ next if l.text_elements[t1].nil? or l.text_elements[t2].nil?
205
+
206
+ # if same column...
207
+ if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
208
+ == columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
# The element whose bottom edge is higher (or equal) absorbs the other;
# the absorbed slot is set to nil and removed by compact! below.
209
+ if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
210
+ l.text_elements[t1].merge!(l.text_elements[t2])
211
+ l.text_elements[t2] = nil
212
+ else
213
+ l.text_elements[t2].merge!(l.text_elements[t1])
214
+ l.text_elements[t1] = nil
215
+ end
216
+ end
217
+ end
218
+
219
+ l.text_elements.compact!
220
+ end
221
+
222
+ # remove duplicate lines
223
+ # TODO this shouldn't have happened here, check why we have to do
224
+ # this (maybe duplication is happening in the column merging phase?)
225
+ (0..lines.size - 2).each do |i|
226
+ next if lines[i].nil?
227
+ # if any of the elements on the next line is duplicated, kill
228
+ # the next line
229
+ if (0..lines[i].text_elements.size-1).any? { |j| lines[i].text_elements[j] == lines[i+1].text_elements[j] }
230
+ lines[i+1] = nil
231
+ end
232
+ end
# The rows are returned as plain arrays of text elements, not as Line
# objects.
233
+ lines.compact.map { |line|
234
+ line.text_elements.sort_by(&:left)
235
+ }
236
+ end
237
+ end
@@ -0,0 +1,3 @@
1
module Tabula
  # Version of the tabula-extractor gem; read by the gemspec and by the
  # command-line tool's version banner.
  VERSION = '0.0.1'
end
@@ -0,0 +1,50 @@
1
+ require 'algorithms'
2
module Tabula
  # Detect whitespace in a document (not yet used in Tabula)
  # Described in "Two Geometric Algorithms for Layout Analysis" (Thomas Breuel)
  # http://pdf.aminer.org/000/140/219/two_geometric_algorithms_for_layout_analysis.pdf
  module Whitespace

    # Returns the element of +text_elements+ whose midpoint is nearest (by
    # Euclidean distance) to the point (+x+, +y+), or nil for an empty list.
    def self.find_closest(text_elements, x, y)
      # min_by finds the nearest element in one pass; no full sort needed.
      text_elements.min_by { |te|
        mid_x, mid_y = te.midpoint
        Math.sqrt((x - mid_x) ** 2 + (y - mid_y) ** 2)
      }
    end


    # Searches +bounds+ for a rectangle free of +text_elements+.
    #
    # Candidate rectangles are queued together with the obstacles that
    # overlap them, prioritized by area. The first popped rectangle without
    # obstacles is returned. If the queue drains first, the array of empty
    # sub-rectangles collected along the way is returned instead.
    # NOTE(review): the two exits return different types (a single zone vs.
    # an array) - confirm which one callers should expect.
    def self.find_whitespace(text_elements, bounds)
      queue = Containers::PriorityQueue.new
      queue.push([bounds, text_elements], bounds.width * bounds.height)
      rv = []

      until queue.empty?
        r, obstacles = queue.pop
        return r if obstacles.empty?

        pivot = find_closest(obstacles, *r.midpoint)

        # The four corner regions of +r+ around the pivot:
        # top-right, top-left, bottom-left, bottom-right.
        subrectangles = [
          ZoneEntity.new(r.top, pivot.right, r.right - pivot.right, pivot.top - r.top),
          ZoneEntity.new(r.top, r.left, pivot.left - r.left, pivot.top - r.top),
          ZoneEntity.new(pivot.bottom, r.left, pivot.left - r.left, r.bottom - pivot.bottom),
          ZoneEntity.new(pivot.bottom, pivot.right, r.right - pivot.right, r.bottom - pivot.bottom)
        ]
        subrectangles.each do |sub_r|
          obs = obstacles.select { |s| s.overlaps?(sub_r) }
          if obs.empty?
            rv << sub_r
          else
            queue.push([sub_r, obs], sub_r.width * sub_r.height)
          end
        end
      end
      rv
    end
  end
end
@@ -0,0 +1,30 @@
1
+ require 'csv'
2
+ require 'json'
3
+
4
module Tabula
  # Serializers for extracted tables. Every writer takes +lines+ (an array
  # of rows, each an array of objects responding to #text) and an IO-like
  # +output+ that defaults to $stdout.
  module Writers

    # Writes one CSV record per row, terminated by CRLF.
    def Writers.CSV(lines, output=$stdout)
      lines.each { |l|
        output.write CSV.generate_line(l.map(&:text), row_sep: "\r\n")
      }
    end

    # Writes the whole table as a single JSON document.
    def Writers.JSON(lines, output=$stdout)
      output.write lines.to_json
    end

    # Writes one tab-separated record per row, terminated by a newline.
    def Writers.TSV(lines, output=$stdout)
      lines.each { |l|
        # Double quotes are required here: '\n' in single quotes is a
        # literal backslash followed by "n", not a line break.
        output.write(l.map(&:text).join("\t") + "\n")
      }
    end


    # Not implemented yet; always raises RuntimeError.
    def Writers.HTML(lines, output=$stdout)
      raise "not implemented"
    end


  end
end
@@ -0,0 +1,26 @@
1
+ # encoding: utf-8
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require 'tabula/version'
4
+
5
+
6
# Gem specification for tabula-extractor. The platform is 'java', and the
# file lists are taken from git, so the gem has to be built from a checkout.
Gem::Specification.new do |spec|
  spec.name        = "tabula-extractor"
  spec.version     = Tabula::VERSION
  spec.authors     = ["Manuel Aristarán"]
  spec.email       = ["manuel@jazzido.com"]
  spec.homepage    = "https://github.com/jazzido/tabula-extractor"
  spec.summary     = "extract tables from PDF files"
  spec.description = "extract tables from PDF files"

  spec.platform = 'java'

  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency 'minitest'
  spec.add_development_dependency 'bundler', '>= 1.3.5'

  spec.add_runtime_dependency "trollop", ["~> 2.0"]
end
Binary file
Binary file
data/test/data/gre.pdf ADDED
Binary file
Binary file
data/test/tests.rb ADDED
@@ -0,0 +1,72 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'minitest/autorun'
3
+
4
+ require_relative '../lib/tabula'
5
+
6
# Converts a table (array of rows of objects responding to #text) into
# plain nested arrays of strings, for easy comparison in assertions.
def lines_to_array(lines)
  lines.map do |row|
    row.map(&:text)
  end
end
11
+
12
# Checks that PagesInfoExtractor yields Tabula::Page objects, two of them
# for the gre.pdf fixture.
class TestPagesInfoExtractor < MiniTest::Unit::TestCase
  def test_pages_info_extractor
    pdf_path = File.expand_path('data/gre.pdf', File.dirname(__FILE__))
    pages = Tabula::Extraction::PagesInfoExtractor.new(pdf_path).pages.to_a

    pages.each { |page| assert_instance_of Tabula::Page, page }
    assert_equal 2, pages.size
  end
end
24
+
25
+
26
# Tests for CharacterExtractor against the gre.pdf fixture.
class TestDumper < MiniTest::Unit::TestCase

  # The extractor yields Tabula::Page objects.
  def test_extractor
    extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
    page = extractor.extract.first
    assert_instance_of Tabula::Page, page
  end

  # Restricting the page to an area returns only the characters inside it.
  def test_get_by_area

    # http://localhost:8080/debug/418b1d5698e5c7b724551d9610c071ab3063275c/characters?x1=57.921428571428564&x2=290.7&y1=107.1&y2=394.52142857142854&page=1&use_lines=false
    extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
    characters = extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
    # assert_equal takes (expected, actual); the swapped order produced a
    # misleading failure message.
    assert_equal 206, characters.size
  end
end
42
+
43
+ class TestExtractor < MiniTest::Unit::TestCase
44
+
45
+ def test_table_extraction_1
46
+ character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
47
+ characters = character_extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
48
+ table = lines_to_array Tabula.make_table(characters)
49
+ expected = [["Prior Scale ", "New Scale ", "% Rank* "], ["800 ", "170 ", "99 "], ["790 ", "170 ", "99 "], ["780 ", "170 ", "99 "], ["770 ", "170 ", "99 "], ["760 ", "170 ", "99 "], ["750 ", "169 ", "99 "], ["740 ", "169 ", "99 "], ["730 ", "168 ", "98 "], ["720 ", "168 ", "98 "], ["710 ", "167 ", "97 "], ["700 ", "166 ", "96 "], ["690 ", "165 ", "95 "], ["680 ", "165 ", "95 "], ["670 ", "164 ", "93 "], ["660 ", "164 ", "93 "], ["650 ", "163 ", "91 "]]
50
+ assert_equal expected, table
51
+ end
52
+
53
+ def test_diputados_voting_record
54
+ character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)))
55
+ characters = character_extractor.extract.next.get_text([269.875, 12.75, 790.5, 561])
56
+
57
+ expected = [["Apellido y Nombre", "Bloque político", "Provincia", ""], ["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
58
+
59
+ assert_equal expected, lines_to_array(Tabula.make_table(characters))
60
+ end
61
+
62
+ # TODO Spaces inserted in words - fails
63
+ def test_bo_page24
64
+ character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
65
+ characters = character_extractor.extract.next.get_text([435.625, 53.125, 570.7142857142857, 810.5357142857142])
66
+
67
+ expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B.MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
68
+ assert_equal expected, lines_to_array(Tabula.make_table(characters))
69
+ end
70
+
71
+
72
+ end
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tabula-extractor
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: java
7
+ authors:
8
+ - Manuel Aristarán
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-05-09 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: minitest
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: !binary |-
21
+ MA==
22
+ none: false
23
+ requirement: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: !binary |-
28
+ MA==
29
+ none: false
30
+ prerelease: false
31
+ type: :development
32
+ - !ruby/object:Gem::Dependency
33
+ name: bundler
34
+ version_requirements: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: 1.3.5
39
+ none: false
40
+ requirement: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: 1.3.5
45
+ none: false
46
+ prerelease: false
47
+ type: :development
48
+ - !ruby/object:Gem::Dependency
49
+ name: trollop
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2.0'
55
+ none: false
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '2.0'
61
+ none: false
62
+ prerelease: false
63
+ type: :runtime
64
+ description: extract tables from PDF files
65
+ email:
66
+ - manuel@jazzido.com
67
+ executables:
68
+ - tabula
69
+ extensions: []
70
+ extra_rdoc_files: []
71
+ files:
72
+ - ".gitignore"
73
+ - AUTHORS.md
74
+ - Gemfile
75
+ - LICENSE.md
76
+ - NOTICE.txt
77
+ - README.md
78
+ - Rakefile
79
+ - bin/tabula
80
+ - lib/tabula.rb
81
+ - lib/tabula/entities.rb
82
+ - lib/tabula/pdf_dump.rb
83
+ - lib/tabula/table_extractor.rb
84
+ - lib/tabula/version.rb
85
+ - lib/tabula/whitespace.rb
86
+ - lib/tabula/writers.rb
87
+ - tabula-extractor.gemspec
88
+ - target/pdfbox-app-1.8.0.jar
89
+ - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
90
+ - test/data/argentina_diputados_voting_record.pdf
91
+ - test/data/bo_page24.pdf
92
+ - test/data/gre.pdf
93
+ - test/data/tabla_subsidios.pdf
94
+ - test/tests.rb
95
+ homepage: https://github.com/jazzido/tabula-extractor
96
+ licenses: []
97
+ post_install_message:
98
+ rdoc_options: []
99
+ require_paths:
100
+ - lib
101
+ required_ruby_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ segments:
106
+ - 0
107
+ hash: 2
108
+ version: !binary |-
109
+ MA==
110
+ none: false
111
+ required_rubygems_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ segments:
116
+ - 0
117
+ hash: 2
118
+ version: !binary |-
119
+ MA==
120
+ none: false
121
+ requirements: []
122
+ rubyforge_project:
123
+ rubygems_version: 1.8.24
124
+ signing_key:
125
+ specification_version: 3
126
+ summary: extract tables from PDF files
127
+ test_files:
128
+ - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
129
+ - test/data/argentina_diputados_voting_record.pdf
130
+ - test/data/bo_page24.pdf
131
+ - test/data/gre.pdf
132
+ - test/data/tabla_subsidios.pdf
133
+ - test/tests.rb