tabula-extractor 0.6.5-java → 0.6.6-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -232,6 +232,10 @@ module Tabula
232
232
  self.height = t.height
233
233
  else
234
234
  if in_same_column = self.text_elements.find { |te| te.horizontally_overlaps?(t) }
235
+ #sometimes a space needs to be added here
236
+ unless in_same_column.vertically_overlaps?(t)
237
+ t.text = " " + t.text
238
+ end
235
239
  in_same_column.merge!(t)
236
240
  else
237
241
  self.text_elements << t
@@ -14,7 +14,15 @@ module Tabula
14
14
  def initialize(text_elements, options = {})
15
15
  self.text_elements = text_elements
16
16
  self.options = DEFAULT_OPTIONS.merge(options)
17
- merge_words! if self.options[:merge_words]
17
+
18
+ if self.options[:merge_words]
19
+ if self.options[:vertical_rulings]
20
+ merge_words_in_a_vertical_rulings_aware_manner!(self.options[:vertical_rulings])
21
+ else
22
+ merge_words!
23
+ end
24
+ end
25
+
18
26
  end
19
27
 
20
28
  def get_rows
@@ -49,7 +57,7 @@ module Tabula
49
57
  if column = columns.detect { |c| te.horizontally_overlaps?(c) }
50
58
  column << te
51
59
  else
52
- puts "couldn't find a place for #{te.inspect}"
60
+ #puts "couldn't find a place for #{te.inspect}"
53
61
  #columns << Column.new(te.left, te.width, [te])
54
62
  end
55
63
  end
@@ -134,21 +142,69 @@ module Tabula
134
142
  self.text_elements.compact!
135
143
  return self.text_elements
136
144
  end
145
+
146
# Merges consecutive character elements of self.text_elements into words, in
# place, never merging across a vertical ruling. This is also where the
# spaces between words are appended.
#
# vertical_rulings - collection of objects responding to +left+; an element is
#                    not merged into the current word when some ruling's
#                    +left+ lies strictly between the word's +left+ and the
#                    element's +left+.
#
# Returns self.text_elements (merged and compacted). Calls after the first
# return the list untouched.
def merge_words_in_a_vertical_rulings_aware_manner!(vertical_rulings)
  return self.text_elements if @merged # only merge once. awful hack.
  @merged = true

  # Drop nil entries up front. The previous loop hit `next` on a nil element
  # without advancing its index, so a single nil made it spin forever.
  self.text_elements.compact!
  elements = self.text_elements
  ruling_positions = vertical_rulings.map(&:left)

  current_word_index = 0
  # Last element that went through the merge branch (initially the first
  # element); used by the "is either side already a space?" check below.
  char1 = elements.first

  (1...elements.size).each do |next_index|
    char2 = elements[next_index]
    current_word = elements[current_word_index]

    if current_word.should_merge?(char2)
      crosses_ruling = ruling_positions.any? do |loc|
        current_word.left < loc && char2.left > loc
      end

      if crosses_ruling
        # Don't merge across a ruling -- but don't discard the element either:
        # it becomes the first element of a new word on the far side.
        current_word_index = next_index
      else
        current_word.merge!(char2)
        elements[next_index] = nil # absorbed into current_word; compacted below
      end
      char1 = char2
    else
      # is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
      if char1.text != " " && char2.text != " " && current_word.should_add_space?(char2)
        current_word.text += " "
      end
      current_word_index = next_index
    end
  end

  elements.compact!
  self.text_elements
end
137
182
  end
138
183
 
139
- # TODO next four module methods are deprecated
184
+ ##
185
+ # Deprecated.
186
+ ##
140
187
  def Tabula.group_by_columns(text_elements, merge_words=false)
141
188
  TableExtractor.new(text_elements, :merge_words => merge_words).group_by_columns
142
189
  end
143
190
 
191
+ ##
192
+ # Deprecated.
193
+ ##
144
194
  def Tabula.get_line_boundaries(text_elements)
145
195
  TableExtractor.new(text_elements).get_line_boundaries
146
196
  end
147
197
 
198
+ ##
199
+ # Deprecated.
200
+ ##
148
201
  def Tabula.get_columns(text_elements, merge_words=true)
149
202
  TableExtractor.new(text_elements, :merge_words => merge_words).get_columns
150
203
  end
151
204
 
205
+ ##
206
+ # Deprecated.
207
+ ##
152
208
  def Tabula.get_rows(text_elements, merge_words=true)
153
209
  TableExtractor.new(text_elements, :merge_words => merge_words).get_rows
154
210
  end
@@ -256,7 +312,7 @@ module Tabula
256
312
  vertical_rulings = options[:vertical_rulings]
257
313
  columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words], :vertical_rulings => vertical_rulings}).group_by_columns.sort_by(&:left)
258
314
 
259
- # insert empty cells if needed
315
+ # insert an empty cell in a given column if there's no text elements within that column's boundaries
260
316
  lines.each_with_index do |l, line_index|
261
317
  next if l.text_elements.nil?
262
318
  l.text_elements.compact! # TODO WHY do I have to do this?
@@ -271,26 +327,28 @@ module Tabula
271
327
  end
272
328
 
273
329
  # merge elements that are in the same column
274
- lines.each_with_index do |l, line_index|
275
- next if l.text_elements.nil?
276
-
277
- (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2| #don't remove a string of empty cells
278
- next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
279
-
280
- # if same column...
281
- if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
282
- == columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
283
- if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
284
- l.text_elements[t1].merge!(l.text_elements[t2])
285
- l.text_elements[t2] = nil
286
- else
287
- l.text_elements[t2].merge!(l.text_elements[t1])
288
- l.text_elements[t1] = nil
330
+ unless options[:dontmerge]
331
+ lines.each_with_index do |l, line_index|
332
+ next if l.text_elements.nil?
333
+
334
+ (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2| #don't remove a string of empty cells
335
+ next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
336
+
337
+ # if same column...
338
+ if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
339
+ == columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
340
+ if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
341
+ l.text_elements[t1].merge!(l.text_elements[t2])
342
+ l.text_elements[t2] = nil
343
+ else
344
+ l.text_elements[t2].merge!(l.text_elements[t1])
345
+ l.text_elements[t1] = nil
346
+ end
289
347
  end
290
348
  end
291
- end
292
349
 
293
- l.text_elements.compact!
350
+ l.text_elements.compact!
351
+ end
294
352
  end
295
353
 
296
354
  # remove duplicate lines
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.6.5'
2
+ VERSION = '0.6.6'
3
3
  end
Binary file
@@ -0,0 +1,29 @@
1
# Reproduction script for the vertical-rulings word-merging bug: extracts every
# page of the sample PDF using hand-placed vertical rulings and writes the
# resulting table as TSV.
require './lib/tabula'

input_filename = "vertical_rulings_bug.pdf"
# Ruling x-positions and the page area are expressed on a 1700-unit-wide
# reference page and scaled to each page's actual width below.
reference_width = 1700.0
ruling_positions = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548]
page_areas = [[0, 0, 1000, 1700]]

# Block form guarantees the output file is closed even if extraction raises.
# NOTE(review): the file is named .xls but is written with the TSV writer.
File.open("output.xls", 'w') do |out|
  extractor = Tabula::Extraction::CharacterExtractor.new(input_filename, :all) #:all ) # 1..2643
  extractor.extract.each_with_index do |pdf_page, page_index|
    # NOTE(review): detected rulings are computed but never used below -- the
    # hard-coded vertical rulings are used instead. Kept as-is; confirm whether
    # this call can be dropped.
    lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(input_filename, page_index))

    # Float divisor so an Integer page width cannot truncate the scale.
    scale_factor = pdf_page.width / reference_width
    puts scale_factor

    vertical_rulings = ruling_positions.map do |n|
      x = n * scale_factor
      Geometry::Segment.new_by_arrays([x, 0], [x, 1000])
    end

    page_areas.each do |page_area|
      text = pdf_page.get_text(page_area) # all the characters within the given area.

      table = Tabula.make_table_with_vertical_rulings(
        text,
        {:vertical_rulings => vertical_rulings, :merge_words => true, :dontmerge => true}
      )
      Tabula::Writers.send(:TSV, table, out)
    end
  end
end

# Observed results:
# - with dontmerge false (i.e. if we merge): bad output -- "STCITY", and no
#   spaces in any cities.
# - with dontmerge true (or commented out): "MORGANTOWNWV", and some spaces
#   (e.g. "BRYN MAWR", but not "FRESHMEADOWS").
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.6.5
5
+ version: 0.6.6
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-07-23 00:00:00.000000000 Z
14
+ date: 2013-08-23 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: minitest
@@ -132,6 +132,8 @@ files:
132
132
  - test/data/gre.pdf
133
133
  - test/data/tabla_subsidios.pdf
134
134
  - test/tests.rb
135
+ - vertical_rulings_bug.pdf
136
+ - vertical_rulings_bug.rb
135
137
  homepage: https://github.com/jazzido/tabula-extractor
136
138
  licenses:
137
139
  - MIT