tabula-extractor 0.7.5-java → 0.7.6-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4391bc1af8143d2f60ed2ad11a66cba9f96c955e
4
- data.tar.gz: d0d905fbd5b2bae105a11a9fd01470921b7dd4f3
3
+ metadata.gz: 9ac7f1daa082acce10e82b94b01b31e07813ad4d
4
+ data.tar.gz: ac521bbba80d6b0571d904565cd31d9af5e7947a
5
5
  SHA512:
6
- metadata.gz: 4ef1e681e511dc074381696689b8d86915f262d4538cf107be6b8844fcbf7102f20cedcdc8964f326178095e7cd9b47d912386801e57a0e4645ee27faacff4a5
7
- data.tar.gz: 336c19a84cd2cf430e24ce0728a5f7753ca78e3b08e2f12251274da577dd2f0cfe2d6a5cf1b9df44dd09e85d9c4e5a80d2672e53632785bc70461f8ca10c3e17
6
+ metadata.gz: 0389d96e5a7a8ad20c147ed3170b922a501126211bec58b012f39662425599437f3f869002825b562cae57d44645ddb776088804f237e168b71473211a86c67a
7
+ data.tar.gz: 53dd7bd11684bf8b8ccd03ea9352140eb3dcc346b7018bee7d6b8049e7e70ee02f59b83e360d97ddf5b7f211a1dccbb83694aab8cb016c7a4ba656f46a37c4c4
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  tabula-extractor
2
2
  ================
3
3
 
4
- [![Build Status](https://travis-ci.org/jazzido/tabula-extractor.png)](https://travis-ci.org/jazzido/tabula-extractor)
4
+ [![Build Status](https://travis-ci.org/tabulapdf/tabula-extractor.png)](https://travis-ci.org/tabulapdf/tabula-extractor)
5
5
 
6
6
  Extract tables from PDF files. `tabula-extractor` is the table extraction engine that powers [Tabula](http://tabula.technology), now available as a library and command line program.
7
7
 
@@ -125,7 +125,7 @@ extractor.extract.each_with_index do |pdf_page, page_index|
125
125
  page_areas = [[250, 0, 325, 1700]]
126
126
 
127
127
  page_areas.each do |page_area|
128
- out << pdf_page.get_area(page_area).make_table.to_csv
128
+ out << pdf_page.get_area(page_area).get_table.to_csv
129
129
  out << "\n\n"
130
130
  end
131
131
 
@@ -155,7 +155,7 @@ extractor.extract.each_with_index do |pdf_page, page_index|
155
155
  vertical_rulings = vertical_ruling_locations.map{|n| Tabula::Ruling.new(0, n * scale_factor, 0, 1000)}
156
156
 
157
157
  page_areas.each do |page_area|
158
- out << pdf_page.get_area(page_area).make_table(:vertical_rulings => vertical_rulings).to_csv
158
+ out << pdf_page.get_area(page_area).get_table(:vertical_rulings => vertical_rulings).to_csv
159
159
  out << "\n\n"
160
160
  end
161
161
  end
@@ -191,6 +191,10 @@ module Tabula
191
191
  end
192
192
  end
193
193
 
194
+ def has_text?
195
+ !self.texts.empty?
196
+ end
197
+
194
198
  # TODO no need for this, let's choose one name
195
199
  def ruling_lines
196
200
  get_ruling_lines!
@@ -258,7 +262,7 @@ module Tabula
258
262
  :height => self.height,
259
263
  :number => self.number,
260
264
  :rotation => self.rotation,
261
- :texts => self.texts
265
+ :hasText => self.has_text?
262
266
  }.to_json(options)
263
267
  end
264
268
 
@@ -239,7 +239,8 @@ module Tabula
239
239
  end
240
240
 
241
241
  def finite?
242
- top != ::Float::INFINITY && left != ::Float::INFINITY && bottom != ::Float::INFINITY && right != ::Float::INFINITY
242
+ top != ::Float::INFINITY && left != ::Float::INFINITY && bottom != ::Float::INFINITY && right != ::Float::INFINITY &&
243
+ !top.nan? && !left.nan? && !bottom.nan? && !right.nan?
243
244
  end
244
245
 
245
246
  ##
@@ -371,26 +371,37 @@ module Tabula
371
371
 
372
372
 
373
373
  class PagesInfoExtractor
374
- def initialize(pdf_filename, password='')
375
- @pdf_filename = pdf_filename
376
- @pdf_file = Extraction.openPDF(pdf_filename, password)
374
+ def initialize(pdf_file_path, password='')
375
+ @pdf_filename = pdf_file_path
376
+ @pdf_file = Extraction.openPDF(pdf_file_path, password)
377
377
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
378
+
379
+ @extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
378
380
  end
379
381
 
380
382
  def pages
383
+ found_page_with_texts = false
381
384
  Enumerator.new do |y|
382
385
  begin
383
386
  @all_pages.each_with_index do |page, i|
384
387
  contents = page.getContents
385
388
 
386
- y.yield Tabula::Page.new(@pdf_filename,
389
+ if found_page_with_texts
390
+ page = Tabula::Page.new(@pdf_filename,
387
391
  page.findCropBox.width,
388
392
  page.findCropBox.height,
389
393
  page.getRotation.to_i,
390
394
  i+1) #remember, these are one-indexed
395
+ else
396
+ page = @extractor.extract_page(i+1)
397
+ found_page_with_texts = page.has_text?
398
+ end
399
+
400
+ y.yield page
391
401
  end
392
402
  ensure
393
403
  @pdf_file.close
404
+ @extractor.close!
394
405
  end
395
406
  end
396
407
  end
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.7.5'
2
+ VERSION = '0.7.6'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.5
4
+ version: 0.7.6
5
5
  platform: java
6
6
  authors:
7
7
  - Manuel Aristarán
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-09-29 00:00:00.000000000 Z
13
+ date: 2015-01-31 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: bundler
@@ -94,7 +94,6 @@ files:
94
94
  - .travis.yml
95
95
  - AUTHORS.md
96
96
  - Gemfile
97
- - Gemfile.lock
98
97
  - LICENSE.md
99
98
  - NOTICE.txt
100
99
  - README.md
@@ -147,7 +146,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
147
146
  version: '0'
148
147
  requirements: []
149
148
  rubyforge_project:
150
- rubygems_version: 2.4.1
149
+ rubygems_version: 2.1.9
151
150
  signing_key:
152
151
  specification_version: 4
153
152
  summary: extract tables from PDF files
@@ -1,39 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- tabula-extractor (0.7.5-java)
5
- trollop (~> 2.0)
6
-
7
- GEM
8
- remote: http://rubygems.org/
9
- specs:
10
- coderay (1.1.0)
11
- columnize (0.8.9)
12
- ffi (1.9.5-java)
13
- method_source (0.8.2)
14
- minitest (5.4.2)
15
- pry (0.10.1-java)
16
- coderay (~> 1.1.0)
17
- method_source (~> 0.8.1)
18
- slop (~> 3.4)
19
- spoon (~> 0.0)
20
- rake (10.3.2)
21
- ruby-debug (0.10.4)
22
- columnize (>= 0.1)
23
- ruby-debug-base (~> 0.10.4.0)
24
- ruby-debug-base (0.10.4-java)
25
- slop (3.6.0)
26
- spoon (0.0.4)
27
- ffi
28
- trollop (2.0)
29
-
30
- PLATFORMS
31
- java
32
-
33
- DEPENDENCIES
34
- bundler (>= 1.3.4)
35
- minitest
36
- pry
37
- rake
38
- ruby-debug
39
- tabula-extractor!