tabula-extractor 0.6.6-java → 0.7.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
metadata CHANGED
@@ -1,8 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 0.6.6
4
+ version: 0.7.0
6
5
  platform: java
7
6
  authors:
8
7
  - Manuel Aristarán
@@ -11,7 +10,7 @@ authors:
11
10
  autorequire:
12
11
  bindir: bin
13
12
  cert_chain: []
14
- date: 2013-08-23 00:00:00.000000000 Z
13
+ date: 2014-01-07 00:00:00.000000000 Z
15
14
  dependencies:
16
15
  - !ruby/object:Gem::Dependency
17
16
  name: minitest
@@ -20,13 +19,11 @@ dependencies:
20
19
  - - '>='
21
20
  - !ruby/object:Gem::Version
22
21
  version: '0'
23
- none: false
24
22
  requirement: !ruby/object:Gem::Requirement
25
23
  requirements:
26
24
  - - '>='
27
25
  - !ruby/object:Gem::Version
28
26
  version: '0'
29
- none: false
30
27
  prerelease: false
31
28
  type: :development
32
29
  - !ruby/object:Gem::Dependency
@@ -36,13 +33,11 @@ dependencies:
36
33
  - - '>='
37
34
  - !ruby/object:Gem::Version
38
35
  version: 1.3.4
39
- none: false
40
36
  requirement: !ruby/object:Gem::Requirement
41
37
  requirements:
42
38
  - - '>='
43
39
  - !ruby/object:Gem::Version
44
40
  version: 1.3.4
45
- none: false
46
41
  prerelease: false
47
42
  type: :development
48
43
  - !ruby/object:Gem::Dependency
@@ -52,13 +47,25 @@ dependencies:
52
47
  - - '>='
53
48
  - !ruby/object:Gem::Version
54
49
  version: '0'
55
- none: false
56
50
  requirement: !ruby/object:Gem::Requirement
57
51
  requirements:
58
52
  - - '>='
59
53
  - !ruby/object:Gem::Version
60
54
  version: '0'
61
- none: false
55
+ prerelease: false
56
+ type: :development
57
+ - !ruby/object:Gem::Dependency
58
+ name: pry
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirement: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
62
69
  prerelease: false
63
70
  type: :development
64
71
  - !ruby/object:Gem::Dependency
@@ -68,13 +75,11 @@ dependencies:
68
75
  - - ~>
69
76
  - !ruby/object:Gem::Version
70
77
  version: '2.0'
71
- none: false
72
78
  requirement: !ruby/object:Gem::Requirement
73
79
  requirements:
74
80
  - - ~>
75
81
  - !ruby/object:Gem::Version
76
82
  version: '2.0'
77
- none: false
78
83
  prerelease: false
79
84
  type: :runtime
80
85
  description: extract tables from PDF files
@@ -109,34 +114,65 @@ files:
109
114
  - ext/liblsd64.dll
110
115
  - ext/lsd.c
111
116
  - ext/lsd.h
112
- - lib/geom/point.rb
113
- - lib/geom/rectangle.rb
114
- - lib/geom/segment.rb
115
117
  - lib/tabula.rb
116
118
  - lib/tabula/core_ext.rb
117
119
  - lib/tabula/entities.rb
120
+ - lib/tabula/entities/cell.rb
121
+ - lib/tabula/entities/has_cells.rb
122
+ - lib/tabula/entities/line.rb
123
+ - lib/tabula/entities/page.rb
124
+ - lib/tabula/entities/page_area.rb
125
+ - lib/tabula/entities/ruling.rb
126
+ - lib/tabula/entities/spreadsheet.rb
127
+ - lib/tabula/entities/table.rb
128
+ - lib/tabula/entities/text_chunk.rb
129
+ - lib/tabula/entities/text_element.rb
130
+ - lib/tabula/entities/zone_entity.rb
131
+ - lib/tabula/extraction.rb
118
132
  - lib/tabula/line_segment_detector.rb
119
- - lib/tabula/pdf_dump.rb
133
+ - lib/tabula/pdf_line_extractor.rb
120
134
  - lib/tabula/pdf_render.rb
135
+ - lib/tabula/spreadsheet_extractor.rb
121
136
  - lib/tabula/table_extractor.rb
122
137
  - lib/tabula/table_guesser.rb
123
138
  - lib/tabula/version.rb
124
- - lib/tabula/whitespace.rb
125
139
  - lib/tabula/writers.rb
126
140
  - tabula-extractor.gemspec
127
141
  - target/pdfbox-app-2.0.0-SNAPSHOT.jar
142
+ - test/data/47008204D_USA.page4.pdf
143
+ - test/data/560015757GV_China.page1.pdf
128
144
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
145
+ - test/data/GSK_2012_Q4.page437.pdf
146
+ - test/data/S2MNCEbirdisland.pdf
129
147
  - test/data/argentina_diputados_voting_record.pdf
130
148
  - test/data/bo_page24.pdf
149
+ - test/data/campaign_donors.pdf
131
150
  - test/data/frx_2012_disclosure.pdf
151
+ - test/data/frx_2012_disclosure.tsv
132
152
  - test/data/gre.pdf
153
+ - test/data/no_tables.pdf
154
+ - test/data/puertos1.pdf
155
+ - test/data/spanning_cells.csv
156
+ - test/data/spanning_cells.pdf
157
+ - test/data/strongschools.pdf
133
158
  - test/data/tabla_subsidios.pdf
159
+ - test/data/vertical_rulings_bug.pdf
160
+ - test/data/vietnam3.pdf
161
+ - test/heuristic-test-set/original/560015757GV_China.page1.pdf
162
+ - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
163
+ - test/heuristic-test-set/original/bo_page24.pdf
164
+ - test/heuristic-test-set/original/campaign_donors.pdf
165
+ - test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
166
+ - test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
167
+ - test/heuristic-test-set/spreadsheet/strongschools.pdf
168
+ - test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
169
+ - test/heuristic.rb
170
+ - test/test_bin_tabula.sh
134
171
  - test/tests.rb
135
- - vertical_rulings_bug.pdf
136
- - vertical_rulings_bug.rb
137
172
  homepage: https://github.com/jazzido/tabula-extractor
138
173
  licenses:
139
174
  - MIT
175
+ metadata: {}
140
176
  post_install_message:
141
177
  rdoc_options: []
142
178
  require_paths:
@@ -145,31 +181,46 @@ required_ruby_version: !ruby/object:Gem::Requirement
145
181
  requirements:
146
182
  - - '>='
147
183
  - !ruby/object:Gem::Version
148
- segments:
149
- - 0
150
- hash: 2
151
184
  version: '0'
152
- none: false
153
185
  required_rubygems_version: !ruby/object:Gem::Requirement
154
186
  requirements:
155
187
  - - '>='
156
188
  - !ruby/object:Gem::Version
157
- segments:
158
- - 0
159
- hash: 2
160
189
  version: '0'
161
- none: false
162
190
  requirements: []
163
191
  rubyforge_project:
164
- rubygems_version: 1.8.24
192
+ rubygems_version: 2.1.9
165
193
  signing_key:
166
- specification_version: 3
194
+ specification_version: 4
167
195
  summary: extract tables from PDF files
168
196
  test_files:
197
+ - test/data/47008204D_USA.page4.pdf
198
+ - test/data/560015757GV_China.page1.pdf
169
199
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
200
+ - test/data/GSK_2012_Q4.page437.pdf
201
+ - test/data/S2MNCEbirdisland.pdf
170
202
  - test/data/argentina_diputados_voting_record.pdf
171
203
  - test/data/bo_page24.pdf
204
+ - test/data/campaign_donors.pdf
172
205
  - test/data/frx_2012_disclosure.pdf
206
+ - test/data/frx_2012_disclosure.tsv
173
207
  - test/data/gre.pdf
208
+ - test/data/no_tables.pdf
209
+ - test/data/puertos1.pdf
210
+ - test/data/spanning_cells.csv
211
+ - test/data/spanning_cells.pdf
212
+ - test/data/strongschools.pdf
174
213
  - test/data/tabla_subsidios.pdf
214
+ - test/data/vertical_rulings_bug.pdf
215
+ - test/data/vietnam3.pdf
216
+ - test/heuristic-test-set/original/560015757GV_China.page1.pdf
217
+ - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
218
+ - test/heuristic-test-set/original/bo_page24.pdf
219
+ - test/heuristic-test-set/original/campaign_donors.pdf
220
+ - test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
221
+ - test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
222
+ - test/heuristic-test-set/spreadsheet/strongschools.pdf
223
+ - test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
224
+ - test/heuristic.rb
225
+ - test/test_bin_tabula.sh
175
226
  - test/tests.rb
data/lib/geom/point.rb DELETED
@@ -1,21 +0,0 @@
1
- #
2
- # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
- # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
- #
5
-
6
-
7
- module Geometry
8
- class Point < Struct.new(:x, :y)
9
- def self.new_by_array(array)
10
- self.new(array[0], array[1])
11
- end
12
-
13
- def ==(another_point)
14
- x === another_point.x && y === another_point.y
15
- end
16
- end
17
- end
18
-
19
- def Point(x, y)
20
- Geometry::Point.new(x, y)
21
- end
data/lib/geom/rectangle.rb DELETED
@@ -1,101 +0,0 @@
1
- #
2
- # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
- # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
- #
5
-
6
-
7
- module Geometry
8
- class Rectangle < Struct.new(:point1, :point2)
9
- SIMILARITY_DIVISOR = 20
10
-
11
- def Rectangle.unionize(non_overlapping_rectangles, next_rect)
12
- #if next_rect doesn't overlap any of non_overlapping_rectangles
13
- if (overlapping = non_overlapping_rectangles.select{|r| next_rect.overlaps? r}) && !non_overlapping_rectangles.empty?
14
- #remove all of those that it overlaps from non_overlapping_rectangles and
15
- non_overlapping_rectangles -= overlapping
16
- #add to non_overlapping_rectangles the bounding box of the overlapping rectangles.
17
- non_overlapping_rectangles << overlapping.inject(next_rect){|memo, overlap| memo.bounding_box(overlap) }
18
-
19
- else
20
- non_overlapping_rectangles << next_rect
21
- end
22
- end
23
-
24
- def self.new_by_x_y_dims(x, y, width, height)
25
- self.new( Point.new_by_array([x, y]), Point.new_by_array([x + width, y + height]) )
26
- end
27
-
28
- def x
29
- [point1.x, point2.x].min
30
- end
31
-
32
- alias_method :left, :x
33
-
34
- def y
35
- #puts "y: [#{point1.y} #{point2.y}].min"
36
- [point1.y, point2.y].min
37
- end
38
-
39
- alias_method :top, :y
40
-
41
- def x2
42
- [point1.x, point2.x].max
43
- end
44
-
45
- alias_method :right, :x2
46
-
47
- def y2
48
- #puts "y2: [#{point1.y} #{point2.y}].max"
49
- [point1.y, point2.y].max
50
- end
51
-
52
- alias_method :bottom, :y2
53
-
54
-
55
- def width
56
- (point1.x - point2.x).abs
57
- end
58
-
59
- def height
60
- (point1.y - point2.y).abs
61
- end
62
-
63
- def area
64
- self.width * self.height
65
- end
66
-
67
- def similarity_hash
68
- [self.x.to_i / SIMILARITY_DIVISOR, self.y.to_i / SIMILARITY_DIVISOR, self.width.to_i / SIMILARITY_DIVISOR, self.height.to_i / SIMILARITY_DIVISOR].to_s
69
- end
70
-
71
- def dims(*format)
72
- if format
73
- format.map{|method| self.send(method)}
74
- else
75
- [self.x, self.y, self.width, self.height]
76
- end
77
- end
78
-
79
- def contains?(other_x, other_y)
80
- (other_x <= x2 && other_x >= x ) && (other_y <= y2 && other_y > y)
81
- end
82
-
83
- def overlaps?(other_rect)
84
- return contains?(other_rect.x, other_rect.y) || contains?(other_rect.x2, other_rect.y2) ||
85
- contains?(other_rect.x, other_rect.y2) || contains?(other_rect.x2, other_rect.y) ||
86
- other_rect.contains?(x, y) || other_rect.contains?(x2, y2) ||
87
- other_rect.contains?(x, y2) || other_rect.contains?(x2, y)
88
- end
89
-
90
- def bounding_box(other_rect)
91
- #new rect with bounding box of these two
92
- new_x1 = [x, other_rect.x].min
93
- new_y1 = [x, other_rect.y].min
94
- new_x2 = [x2, other_rect.x2].max
95
- new_y2 = [y2, other_rect.y2].max
96
- new_width = (new_x2 - new_x1).abs
97
- new_height = (new_y2 - new_y1).abs
98
- Rectangle.new_by_x_y_dims(new_x1, new_y1, new_width, new_height)
99
- end
100
- end
101
- end
data/lib/geom/segment.rb DELETED
@@ -1,82 +0,0 @@
1
- #
2
- # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
- # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
- #
5
-
6
-
7
- module Geometry
8
- include Math
9
- extend Math
10
-
11
- def Geometry.distance(point1, point2)
12
- hypot point1.x - point2.x, point1.y - point2.y
13
- end
14
-
15
-
16
- class Segment < Struct.new(:point1, :point2)
17
- def self.new_by_arrays(point1_coordinates, point2_coordinates)
18
- self.new(Point.new_by_array(point1_coordinates),
19
- Point.new_by_array(point2_coordinates))
20
- end
21
-
22
- def scale!(scale_factor)
23
- self.point1.x = self.point1.x * scale_factor
24
- self.point1.y = self.point1.y * scale_factor
25
- self.point2.x = self.point2.x * scale_factor
26
- self.point2.y = self.point2.y * scale_factor
27
- end
28
-
29
- def vertical?
30
- point1.x == point2.x
31
- end
32
-
33
- def horizontal?
34
- point1.y == point2.y
35
- end
36
-
37
- def leftmost_endpoint
38
- ((point1.x <=> point2.x) == -1) ? point1 : point2
39
- end
40
-
41
- def rightmost_endpoint
42
- ((point1.x <=> point2.x) == 1) ? point1 : point2
43
- end
44
-
45
- def topmost_endpoint
46
- ((point1.y <=> point2.y) == 1) ? point1 : point2
47
- end
48
-
49
- def bottommost_endpoint
50
- ((point1.y <=> point2.y) == -1) ? point1 : point2
51
- end
52
-
53
- def top
54
- topmost_endpoint.y
55
- end
56
-
57
- def bottom
58
- bottommost_endpoint.y
59
- end
60
- def width
61
- (left - right).abs
62
- end
63
- def height
64
- (bottom - top).abs
65
- end
66
-
67
- def left
68
- leftmost_endpoint.x
69
- end
70
-
71
- def right
72
- rightmost_endpoint.x
73
- end
74
- def length
75
- Geometry.distance(point1, point2)
76
- end
77
- end
78
- end
79
-
80
- def Segment(point1, point2)
81
- Geometry::Segment.new point1, point2
82
- end
data/lib/tabula/pdf_dump.rb DELETED
@@ -1,132 +0,0 @@
1
- require 'observer'
2
-
3
- require_relative './entities.rb'
4
-
5
- require 'java'
6
- require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
7
- java_import org.apache.pdfbox.pdfparser.PDFParser
8
- java_import org.apache.pdfbox.pdmodel.PDDocument
9
- java_import org.apache.pdfbox.util.PDFTextStripper
10
- java_import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
11
-
12
- module Tabula
13
-
14
- module Extraction
15
-
16
- def Extraction.openPDF(pdf_filename, password='')
17
- raise Errno::ENOENT unless File.exists?(pdf_filename)
18
- document = PDDocument.load(pdf_filename)
19
- if document.isEncrypted
20
- sdm = StandardDecryptionMaterial.new(password)
21
- document.openProtection(sdm)
22
- end
23
- document
24
- end
25
-
26
- class TextExtractor < org.apache.pdfbox.util.PDFTextStripper
27
-
28
- attr_accessor :characters, :fonts
29
-
30
- PRINTABLE_RE = /[[:print:]]/
31
-
32
- def initialize
33
- super
34
- self.fonts = {}
35
- self.characters = []
36
- self.setSortByPosition(true)
37
- end
38
-
39
- def clear!
40
- self.characters = []; self.fonts = {}
41
- end
42
-
43
- def processTextPosition(text)
44
- # return if text.getCharacter == ' '
45
-
46
- # text_font = text.getFont
47
- # text_size = text.getFontSize
48
- # font_plus_size = self.fonts.select { |k, v| v == text_font }.first.first + "-" + text_size.to_i.to_s
49
-
50
- # $fonts[$current_page].merge!({
51
- # font_plus_size => { :family => text_font.getBaseFont, :size => text_size }
52
- # })
53
-
54
- # $page_contents[$current_page] += " <text top=\"%.2f\" left=\"%.2f\" width=\"%.2f\" height=\"%.2f\" font=\"#{font_plus_size}\" dir=\"#{text.getDir}\">#{text.getCharacter}</text>\n" % [text.getYDirAdj - text.getHeightDir, text.getXDirAdj, text.getWidthDirAdj, text.getHeightDir]
55
-
56
- c = text.getCharacter
57
- # probably not the fastest way of detecting printable chars
58
- self.characters << text if c =~ PRINTABLE_RE
59
-
60
- end
61
- end
62
-
63
- class PagesInfoExtractor
64
- def initialize(pdf_filename, password='')
65
- @pdf_file = Extraction.openPDF(pdf_filename, password)
66
- @all_pages = @pdf_file.getDocumentCatalog.getAllPages
67
- end
68
-
69
- def pages
70
- Enumerator.new do |y|
71
- begin
72
- @all_pages.each_with_index do |page, i|
73
- contents = page.getContents
74
- # next if contents.nil?
75
- y.yield Tabula::Page.new(page.findCropBox.width,
76
- page.findCropBox.height,
77
- page.getRotation.to_i,
78
- i+1)
79
- end
80
- ensure
81
- @pdf_file.close
82
- end
83
- end
84
- end
85
- end
86
-
87
-
88
- class CharacterExtractor
89
- include Observable
90
-
91
- #N.B. pages can be :all, a list of pages or a range.
92
- def initialize(pdf_filename, pages=[1], password='')
93
- raise Errno::ENOENT unless File.exists?(pdf_filename)
94
- @pdf_file = Extraction.openPDF(pdf_filename, password)
95
- @all_pages = @pdf_file.getDocumentCatalog.getAllPages
96
- @pages = pages == :all ? (1..@all_pages.size) : pages
97
- @extractor = TextExtractor.new
98
- end
99
-
100
- def extract
101
- Enumerator.new do |y|
102
- begin
103
- @pages.each do |i|
104
- page = @all_pages.get(i-1)
105
- contents = page.getContents
106
- next if contents.nil?
107
- @extractor.clear!
108
- @extractor.processStream(page, page.findResources, contents.getStream)
109
-
110
- y.yield Tabula::Page.new(page.findCropBox.width,
111
- page.findCropBox.height,
112
- page.getRotation.to_i,
113
- i+1,
114
- @extractor.characters.map { |char|
115
- Tabula::TextElement.new(char.getYDirAdj.round(2),
116
- char.getXDirAdj.round(2),
117
- char.getWidthDirAdj.round(2),
118
- char.getHeightDir.round(2),
119
- char.getFont,
120
- char.getFontSize.round(2),
121
- char.getCharacter,
122
- char.getWidthOfSpace)
123
- })
124
- end
125
- ensure
126
- @pdf_file.close
127
- end # begin
128
- end
129
- end
130
- end
131
- end
132
- end