tabula-extractor 0.6.6-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
metadata CHANGED
@@ -1,8 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 0.6.6
4
+ version: 0.7.0
6
5
  platform: java
7
6
  authors:
8
7
  - Manuel Aristarán
@@ -11,7 +10,7 @@ authors:
11
10
  autorequire:
12
11
  bindir: bin
13
12
  cert_chain: []
14
- date: 2013-08-23 00:00:00.000000000 Z
13
+ date: 2014-01-07 00:00:00.000000000 Z
15
14
  dependencies:
16
15
  - !ruby/object:Gem::Dependency
17
16
  name: minitest
@@ -20,13 +19,11 @@ dependencies:
20
19
  - - '>='
21
20
  - !ruby/object:Gem::Version
22
21
  version: '0'
23
- none: false
24
22
  requirement: !ruby/object:Gem::Requirement
25
23
  requirements:
26
24
  - - '>='
27
25
  - !ruby/object:Gem::Version
28
26
  version: '0'
29
- none: false
30
27
  prerelease: false
31
28
  type: :development
32
29
  - !ruby/object:Gem::Dependency
@@ -36,13 +33,11 @@ dependencies:
36
33
  - - '>='
37
34
  - !ruby/object:Gem::Version
38
35
  version: 1.3.4
39
- none: false
40
36
  requirement: !ruby/object:Gem::Requirement
41
37
  requirements:
42
38
  - - '>='
43
39
  - !ruby/object:Gem::Version
44
40
  version: 1.3.4
45
- none: false
46
41
  prerelease: false
47
42
  type: :development
48
43
  - !ruby/object:Gem::Dependency
@@ -52,13 +47,25 @@ dependencies:
52
47
  - - '>='
53
48
  - !ruby/object:Gem::Version
54
49
  version: '0'
55
- none: false
56
50
  requirement: !ruby/object:Gem::Requirement
57
51
  requirements:
58
52
  - - '>='
59
53
  - !ruby/object:Gem::Version
60
54
  version: '0'
61
- none: false
55
+ prerelease: false
56
+ type: :development
57
+ - !ruby/object:Gem::Dependency
58
+ name: pry
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirement: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
62
69
  prerelease: false
63
70
  type: :development
64
71
  - !ruby/object:Gem::Dependency
@@ -68,13 +75,11 @@ dependencies:
68
75
  - - ~>
69
76
  - !ruby/object:Gem::Version
70
77
  version: '2.0'
71
- none: false
72
78
  requirement: !ruby/object:Gem::Requirement
73
79
  requirements:
74
80
  - - ~>
75
81
  - !ruby/object:Gem::Version
76
82
  version: '2.0'
77
- none: false
78
83
  prerelease: false
79
84
  type: :runtime
80
85
  description: extract tables from PDF files
@@ -109,34 +114,65 @@ files:
109
114
  - ext/liblsd64.dll
110
115
  - ext/lsd.c
111
116
  - ext/lsd.h
112
- - lib/geom/point.rb
113
- - lib/geom/rectangle.rb
114
- - lib/geom/segment.rb
115
117
  - lib/tabula.rb
116
118
  - lib/tabula/core_ext.rb
117
119
  - lib/tabula/entities.rb
120
+ - lib/tabula/entities/cell.rb
121
+ - lib/tabula/entities/has_cells.rb
122
+ - lib/tabula/entities/line.rb
123
+ - lib/tabula/entities/page.rb
124
+ - lib/tabula/entities/page_area.rb
125
+ - lib/tabula/entities/ruling.rb
126
+ - lib/tabula/entities/spreadsheet.rb
127
+ - lib/tabula/entities/table.rb
128
+ - lib/tabula/entities/text_chunk.rb
129
+ - lib/tabula/entities/text_element.rb
130
+ - lib/tabula/entities/zone_entity.rb
131
+ - lib/tabula/extraction.rb
118
132
  - lib/tabula/line_segment_detector.rb
119
- - lib/tabula/pdf_dump.rb
133
+ - lib/tabula/pdf_line_extractor.rb
120
134
  - lib/tabula/pdf_render.rb
135
+ - lib/tabula/spreadsheet_extractor.rb
121
136
  - lib/tabula/table_extractor.rb
122
137
  - lib/tabula/table_guesser.rb
123
138
  - lib/tabula/version.rb
124
- - lib/tabula/whitespace.rb
125
139
  - lib/tabula/writers.rb
126
140
  - tabula-extractor.gemspec
127
141
  - target/pdfbox-app-2.0.0-SNAPSHOT.jar
142
+ - test/data/47008204D_USA.page4.pdf
143
+ - test/data/560015757GV_China.page1.pdf
128
144
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
145
+ - test/data/GSK_2012_Q4.page437.pdf
146
+ - test/data/S2MNCEbirdisland.pdf
129
147
  - test/data/argentina_diputados_voting_record.pdf
130
148
  - test/data/bo_page24.pdf
149
+ - test/data/campaign_donors.pdf
131
150
  - test/data/frx_2012_disclosure.pdf
151
+ - test/data/frx_2012_disclosure.tsv
132
152
  - test/data/gre.pdf
153
+ - test/data/no_tables.pdf
154
+ - test/data/puertos1.pdf
155
+ - test/data/spanning_cells.csv
156
+ - test/data/spanning_cells.pdf
157
+ - test/data/strongschools.pdf
133
158
  - test/data/tabla_subsidios.pdf
159
+ - test/data/vertical_rulings_bug.pdf
160
+ - test/data/vietnam3.pdf
161
+ - test/heuristic-test-set/original/560015757GV_China.page1.pdf
162
+ - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
163
+ - test/heuristic-test-set/original/bo_page24.pdf
164
+ - test/heuristic-test-set/original/campaign_donors.pdf
165
+ - test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
166
+ - test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
167
+ - test/heuristic-test-set/spreadsheet/strongschools.pdf
168
+ - test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
169
+ - test/heuristic.rb
170
+ - test/test_bin_tabula.sh
134
171
  - test/tests.rb
135
- - vertical_rulings_bug.pdf
136
- - vertical_rulings_bug.rb
137
172
  homepage: https://github.com/jazzido/tabula-extractor
138
173
  licenses:
139
174
  - MIT
175
+ metadata: {}
140
176
  post_install_message:
141
177
  rdoc_options: []
142
178
  require_paths:
@@ -145,31 +181,46 @@ required_ruby_version: !ruby/object:Gem::Requirement
145
181
  requirements:
146
182
  - - '>='
147
183
  - !ruby/object:Gem::Version
148
- segments:
149
- - 0
150
- hash: 2
151
184
  version: '0'
152
- none: false
153
185
  required_rubygems_version: !ruby/object:Gem::Requirement
154
186
  requirements:
155
187
  - - '>='
156
188
  - !ruby/object:Gem::Version
157
- segments:
158
- - 0
159
- hash: 2
160
189
  version: '0'
161
- none: false
162
190
  requirements: []
163
191
  rubyforge_project:
164
- rubygems_version: 1.8.24
192
+ rubygems_version: 2.1.9
165
193
  signing_key:
166
- specification_version: 3
194
+ specification_version: 4
167
195
  summary: extract tables from PDF files
168
196
  test_files:
197
+ - test/data/47008204D_USA.page4.pdf
198
+ - test/data/560015757GV_China.page1.pdf
169
199
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
200
+ - test/data/GSK_2012_Q4.page437.pdf
201
+ - test/data/S2MNCEbirdisland.pdf
170
202
  - test/data/argentina_diputados_voting_record.pdf
171
203
  - test/data/bo_page24.pdf
204
+ - test/data/campaign_donors.pdf
172
205
  - test/data/frx_2012_disclosure.pdf
206
+ - test/data/frx_2012_disclosure.tsv
173
207
  - test/data/gre.pdf
208
+ - test/data/no_tables.pdf
209
+ - test/data/puertos1.pdf
210
+ - test/data/spanning_cells.csv
211
+ - test/data/spanning_cells.pdf
212
+ - test/data/strongschools.pdf
174
213
  - test/data/tabla_subsidios.pdf
214
+ - test/data/vertical_rulings_bug.pdf
215
+ - test/data/vietnam3.pdf
216
+ - test/heuristic-test-set/original/560015757GV_China.page1.pdf
217
+ - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
218
+ - test/heuristic-test-set/original/bo_page24.pdf
219
+ - test/heuristic-test-set/original/campaign_donors.pdf
220
+ - test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
221
+ - test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
222
+ - test/heuristic-test-set/spreadsheet/strongschools.pdf
223
+ - test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
224
+ - test/heuristic.rb
225
+ - test/test_bin_tabula.sh
175
226
  - test/tests.rb
data/lib/geom/point.rb DELETED
@@ -1,21 +0,0 @@
1
- #
2
- # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
- # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
- #
5
-
6
-
7
- module Geometry
8
- class Point < Struct.new(:x, :y)
9
- def self.new_by_array(array)
10
- self.new(array[0], array[1])
11
- end
12
-
13
- def ==(another_point)
14
- x === another_point.x && y === another_point.y
15
- end
16
- end
17
- end
18
-
19
- def Point(x, y)
20
- Geometry::Point.new(x, y)
21
- end
@@ -1,101 +0,0 @@
1
- #
2
- # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
- # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
- #
5
-
6
-
7
- module Geometry
8
- class Rectangle < Struct.new(:point1, :point2)
9
- SIMILARITY_DIVISOR = 20
10
-
11
- def Rectangle.unionize(non_overlapping_rectangles, next_rect)
12
- #if next_rect doesn't overlap any of non_overlapping_rectangles
13
- if (overlapping = non_overlapping_rectangles.select{|r| next_rect.overlaps? r}) && !non_overlapping_rectangles.empty?
14
- #remove all of those that it overlaps from non_overlapping_rectangles and
15
- non_overlapping_rectangles -= overlapping
16
- #add to non_overlapping_rectangles the bounding box of the overlapping rectangles.
17
- non_overlapping_rectangles << overlapping.inject(next_rect){|memo, overlap| memo.bounding_box(overlap) }
18
-
19
- else
20
- non_overlapping_rectangles << next_rect
21
- end
22
- end
23
-
24
- def self.new_by_x_y_dims(x, y, width, height)
25
- self.new( Point.new_by_array([x, y]), Point.new_by_array([x + width, y + height]) )
26
- end
27
-
28
- def x
29
- [point1.x, point2.x].min
30
- end
31
-
32
- alias_method :left, :x
33
-
34
- def y
35
- #puts "y: [#{point1.y} #{point2.y}].min"
36
- [point1.y, point2.y].min
37
- end
38
-
39
- alias_method :top, :y
40
-
41
- def x2
42
- [point1.x, point2.x].max
43
- end
44
-
45
- alias_method :right, :x2
46
-
47
- def y2
48
- #puts "y2: [#{point1.y} #{point2.y}].max"
49
- [point1.y, point2.y].max
50
- end
51
-
52
- alias_method :bottom, :y2
53
-
54
-
55
- def width
56
- (point1.x - point2.x).abs
57
- end
58
-
59
- def height
60
- (point1.y - point2.y).abs
61
- end
62
-
63
- def area
64
- self.width * self.height
65
- end
66
-
67
- def similarity_hash
68
- [self.x.to_i / SIMILARITY_DIVISOR, self.y.to_i / SIMILARITY_DIVISOR, self.width.to_i / SIMILARITY_DIVISOR, self.height.to_i / SIMILARITY_DIVISOR].to_s
69
- end
70
-
71
- def dims(*format)
72
- if format
73
- format.map{|method| self.send(method)}
74
- else
75
- [self.x, self.y, self.width, self.height]
76
- end
77
- end
78
-
79
- def contains?(other_x, other_y)
80
- (other_x <= x2 && other_x >= x ) && (other_y <= y2 && other_y > y)
81
- end
82
-
83
- def overlaps?(other_rect)
84
- return contains?(other_rect.x, other_rect.y) || contains?(other_rect.x2, other_rect.y2) ||
85
- contains?(other_rect.x, other_rect.y2) || contains?(other_rect.x2, other_rect.y) ||
86
- other_rect.contains?(x, y) || other_rect.contains?(x2, y2) ||
87
- other_rect.contains?(x, y2) || other_rect.contains?(x2, y)
88
- end
89
-
90
- def bounding_box(other_rect)
91
- #new rect with bounding box of these two
92
- new_x1 = [x, other_rect.x].min
93
- new_y1 = [x, other_rect.y].min
94
- new_x2 = [x2, other_rect.x2].max
95
- new_y2 = [y2, other_rect.y2].max
96
- new_width = (new_x2 - new_x1).abs
97
- new_height = (new_y2 - new_y1).abs
98
- Rectangle.new_by_x_y_dims(new_x1, new_y1, new_width, new_height)
99
- end
100
- end
101
- end
data/lib/geom/segment.rb DELETED
@@ -1,82 +0,0 @@
1
- #
2
- # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
- # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
- #
5
-
6
-
7
- module Geometry
8
- include Math
9
- extend Math
10
-
11
- def Geometry.distance(point1, point2)
12
- hypot point1.x - point2.x, point1.y - point2.y
13
- end
14
-
15
-
16
- class Segment < Struct.new(:point1, :point2)
17
- def self.new_by_arrays(point1_coordinates, point2_coordinates)
18
- self.new(Point.new_by_array(point1_coordinates),
19
- Point.new_by_array(point2_coordinates))
20
- end
21
-
22
- def scale!(scale_factor)
23
- self.point1.x = self.point1.x * scale_factor
24
- self.point1.y = self.point1.y * scale_factor
25
- self.point2.x = self.point2.x * scale_factor
26
- self.point2.y = self.point2.y * scale_factor
27
- end
28
-
29
- def vertical?
30
- point1.x == point2.x
31
- end
32
-
33
- def horizontal?
34
- point1.y == point2.y
35
- end
36
-
37
- def leftmost_endpoint
38
- ((point1.x <=> point2.x) == -1) ? point1 : point2
39
- end
40
-
41
- def rightmost_endpoint
42
- ((point1.x <=> point2.x) == 1) ? point1 : point2
43
- end
44
-
45
- def topmost_endpoint
46
- ((point1.y <=> point2.y) == 1) ? point1 : point2
47
- end
48
-
49
- def bottommost_endpoint
50
- ((point1.y <=> point2.y) == -1) ? point1 : point2
51
- end
52
-
53
- def top
54
- topmost_endpoint.y
55
- end
56
-
57
- def bottom
58
- bottommost_endpoint.y
59
- end
60
- def width
61
- (left - right).abs
62
- end
63
- def height
64
- (bottom - top).abs
65
- end
66
-
67
- def left
68
- leftmost_endpoint.x
69
- end
70
-
71
- def right
72
- rightmost_endpoint.x
73
- end
74
- def length
75
- Geometry.distance(point1, point2)
76
- end
77
- end
78
- end
79
-
80
- def Segment(point1, point2)
81
- Geometry::Segment.new point1, point2
82
- end
@@ -1,132 +0,0 @@
1
- require 'observer'
2
-
3
- require_relative './entities.rb'
4
-
5
- require 'java'
6
- require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
7
- java_import org.apache.pdfbox.pdfparser.PDFParser
8
- java_import org.apache.pdfbox.pdmodel.PDDocument
9
- java_import org.apache.pdfbox.util.PDFTextStripper
10
- java_import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
11
-
12
- module Tabula
13
-
14
- module Extraction
15
-
16
- def Extraction.openPDF(pdf_filename, password='')
17
- raise Errno::ENOENT unless File.exists?(pdf_filename)
18
- document = PDDocument.load(pdf_filename)
19
- if document.isEncrypted
20
- sdm = StandardDecryptionMaterial.new(password)
21
- document.openProtection(sdm)
22
- end
23
- document
24
- end
25
-
26
- class TextExtractor < org.apache.pdfbox.util.PDFTextStripper
27
-
28
- attr_accessor :characters, :fonts
29
-
30
- PRINTABLE_RE = /[[:print:]]/
31
-
32
- def initialize
33
- super
34
- self.fonts = {}
35
- self.characters = []
36
- self.setSortByPosition(true)
37
- end
38
-
39
- def clear!
40
- self.characters = []; self.fonts = {}
41
- end
42
-
43
- def processTextPosition(text)
44
- # return if text.getCharacter == ' '
45
-
46
- # text_font = text.getFont
47
- # text_size = text.getFontSize
48
- # font_plus_size = self.fonts.select { |k, v| v == text_font }.first.first + "-" + text_size.to_i.to_s
49
-
50
- # $fonts[$current_page].merge!({
51
- # font_plus_size => { :family => text_font.getBaseFont, :size => text_size }
52
- # })
53
-
54
- # $page_contents[$current_page] += " <text top=\"%.2f\" left=\"%.2f\" width=\"%.2f\" height=\"%.2f\" font=\"#{font_plus_size}\" dir=\"#{text.getDir}\">#{text.getCharacter}</text>\n" % [text.getYDirAdj - text.getHeightDir, text.getXDirAdj, text.getWidthDirAdj, text.getHeightDir]
55
-
56
- c = text.getCharacter
57
- # probably not the fastest way of detecting printable chars
58
- self.characters << text if c =~ PRINTABLE_RE
59
-
60
- end
61
- end
62
-
63
- class PagesInfoExtractor
64
- def initialize(pdf_filename, password='')
65
- @pdf_file = Extraction.openPDF(pdf_filename, password)
66
- @all_pages = @pdf_file.getDocumentCatalog.getAllPages
67
- end
68
-
69
- def pages
70
- Enumerator.new do |y|
71
- begin
72
- @all_pages.each_with_index do |page, i|
73
- contents = page.getContents
74
- # next if contents.nil?
75
- y.yield Tabula::Page.new(page.findCropBox.width,
76
- page.findCropBox.height,
77
- page.getRotation.to_i,
78
- i+1)
79
- end
80
- ensure
81
- @pdf_file.close
82
- end
83
- end
84
- end
85
- end
86
-
87
-
88
- class CharacterExtractor
89
- include Observable
90
-
91
- #N.B. pages can be :all, a list of pages or a range.
92
- def initialize(pdf_filename, pages=[1], password='')
93
- raise Errno::ENOENT unless File.exists?(pdf_filename)
94
- @pdf_file = Extraction.openPDF(pdf_filename, password)
95
- @all_pages = @pdf_file.getDocumentCatalog.getAllPages
96
- @pages = pages == :all ? (1..@all_pages.size) : pages
97
- @extractor = TextExtractor.new
98
- end
99
-
100
- def extract
101
- Enumerator.new do |y|
102
- begin
103
- @pages.each do |i|
104
- page = @all_pages.get(i-1)
105
- contents = page.getContents
106
- next if contents.nil?
107
- @extractor.clear!
108
- @extractor.processStream(page, page.findResources, contents.getStream)
109
-
110
- y.yield Tabula::Page.new(page.findCropBox.width,
111
- page.findCropBox.height,
112
- page.getRotation.to_i,
113
- i+1,
114
- @extractor.characters.map { |char|
115
- Tabula::TextElement.new(char.getYDirAdj.round(2),
116
- char.getXDirAdj.round(2),
117
- char.getWidthDirAdj.round(2),
118
- char.getHeightDir.round(2),
119
- char.getFont,
120
- char.getFontSize.round(2),
121
- char.getCharacter,
122
- char.getWidthOfSpace)
123
- })
124
- end
125
- ensure
126
- @pdf_file.close
127
- end # begin
128
- end
129
- end
130
- end
131
- end
132
- end