tabula-extractor 0.7.5-java → 0.7.6-java
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/lib/tabula/entities/page.rb +5 -1
- data/lib/tabula/entities/ruling.rb +2 -1
- data/lib/tabula/extraction.rb +15 -4
- data/lib/tabula/version.rb +1 -1
- metadata +3 -4
- data/Gemfile.lock +0 -39
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ac7f1daa082acce10e82b94b01b31e07813ad4d
|
4
|
+
data.tar.gz: ac521bbba80d6b0571d904565cd31d9af5e7947a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0389d96e5a7a8ad20c147ed3170b922a501126211bec58b012f39662425599437f3f869002825b562cae57d44645ddb776088804f237e168b71473211a86c67a
|
7
|
+
data.tar.gz: 53dd7bd11684bf8b8ccd03ea9352140eb3dcc346b7018bee7d6b8049e7e70ee02f59b83e360d97ddf5b7f211a1dccbb83694aab8cb016c7a4ba656f46a37c4c4
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
tabula-extractor
|
2
2
|
================
|
3
3
|
|
4
|
-
[Build Status](https://travis-ci.org/tabulapdf/tabula-extractor)
|
5
5
|
|
6
6
|
Extract tables from PDF files. `tabula-extractor` is the table extraction engine that powers [Tabula](http://tabula.technology), now available as a library and command line program.
|
7
7
|
|
@@ -125,7 +125,7 @@ extractor.extract.each_with_index do |pdf_page, page_index|
|
|
125
125
|
page_areas = [[250, 0, 325, 1700]]
|
126
126
|
|
127
127
|
page_areas.each do |page_area|
|
128
|
-
out << pdf_page.get_area(page_area).
|
128
|
+
out << pdf_page.get_area(page_area).get_table.to_csv
|
129
129
|
out << "\n\n"
|
130
130
|
end
|
131
131
|
|
@@ -155,7 +155,7 @@ extractor.extract.each_with_index do |pdf_page, page_index|
|
|
155
155
|
vertical_rulings = vertical_ruling_locations.map{|n| Tabula::Ruling.new(0, n * scale_factor, 0, 1000)}
|
156
156
|
|
157
157
|
page_areas.each do |page_area|
|
158
|
-
out << pdf_page.get_area(page_area).
|
158
|
+
out << pdf_page.get_area(page_area).get_table(:vertical_rulings => vertical_rulings).to_csv
|
159
159
|
out << "\n\n"
|
160
160
|
end
|
161
161
|
end
|
data/lib/tabula/entities/page.rb
CHANGED
@@ -191,6 +191,10 @@ module Tabula
|
|
191
191
|
end
|
192
192
|
end
|
193
193
|
|
194
|
+
def has_text?
|
195
|
+
!self.texts.empty?
|
196
|
+
end
|
197
|
+
|
194
198
|
# TODO no need for this, let's choose one name
|
195
199
|
def ruling_lines
|
196
200
|
get_ruling_lines!
|
@@ -258,7 +262,7 @@ module Tabula
|
|
258
262
|
:height => self.height,
|
259
263
|
:number => self.number,
|
260
264
|
:rotation => self.rotation,
|
261
|
-
:
|
265
|
+
:hasText => self.has_text?
|
262
266
|
}.to_json(options)
|
263
267
|
end
|
264
268
|
|
data/lib/tabula/entities/ruling.rb
CHANGED
@@ -239,7 +239,8 @@ module Tabula
|
|
239
239
|
end
|
240
240
|
|
241
241
|
def finite?
|
242
|
-
top != ::Float::INFINITY && left != ::Float::INFINITY && bottom != ::Float::INFINITY && right != ::Float::INFINITY
|
242
|
+
top != ::Float::INFINITY && left != ::Float::INFINITY && bottom != ::Float::INFINITY && right != ::Float::INFINITY &&
|
243
|
+
!top.nan? && !left.nan? && !bottom.nan? && !right.nan?
|
243
244
|
end
|
244
245
|
|
245
246
|
##
|
data/lib/tabula/extraction.rb
CHANGED
@@ -371,26 +371,37 @@ module Tabula
|
|
371
371
|
|
372
372
|
|
373
373
|
class PagesInfoExtractor
|
374
|
-
def initialize(
|
375
|
-
@pdf_filename =
|
376
|
-
@pdf_file = Extraction.openPDF(
|
374
|
+
def initialize(pdf_file_path, password='')
|
375
|
+
@pdf_filename = pdf_file_path
|
376
|
+
@pdf_file = Extraction.openPDF(pdf_file_path, password)
|
377
377
|
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
378
|
+
|
379
|
+
@extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
|
378
380
|
end
|
379
381
|
|
380
382
|
def pages
|
383
|
+
found_page_with_texts = false
|
381
384
|
Enumerator.new do |y|
|
382
385
|
begin
|
383
386
|
@all_pages.each_with_index do |page, i|
|
384
387
|
contents = page.getContents
|
385
388
|
|
386
|
-
|
389
|
+
if found_page_with_texts
|
390
|
+
page = Tabula::Page.new(@pdf_filename,
|
387
391
|
page.findCropBox.width,
|
388
392
|
page.findCropBox.height,
|
389
393
|
page.getRotation.to_i,
|
390
394
|
i+1) #remember, these are one-indexed
|
395
|
+
else
|
396
|
+
page = @extractor.extract_page(i+1)
|
397
|
+
found_page_with_texts = page.has_text?
|
398
|
+
end
|
399
|
+
|
400
|
+
y.yield page
|
391
401
|
end
|
392
402
|
ensure
|
393
403
|
@pdf_file.close
|
404
|
+
@extractor.close!
|
394
405
|
end
|
395
406
|
end
|
396
407
|
end
|
data/lib/tabula/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.5
|
4
|
+
version: 0.7.6
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Manuel Aristarán
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2015-01-31 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: bundler
|
@@ -94,7 +94,6 @@ files:
|
|
94
94
|
- .travis.yml
|
95
95
|
- AUTHORS.md
|
96
96
|
- Gemfile
|
97
|
-
- Gemfile.lock
|
98
97
|
- LICENSE.md
|
99
98
|
- NOTICE.txt
|
100
99
|
- README.md
|
@@ -147,7 +146,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
147
146
|
version: '0'
|
148
147
|
requirements: []
|
149
148
|
rubyforge_project:
|
150
|
-
rubygems_version: 2.
|
149
|
+
rubygems_version: 2.1.9
|
151
150
|
signing_key:
|
152
151
|
specification_version: 4
|
153
152
|
summary: extract tables from PDF files
|
data/Gemfile.lock
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
tabula-extractor (0.7.5-java)
|
5
|
-
trollop (~> 2.0)
|
6
|
-
|
7
|
-
GEM
|
8
|
-
remote: http://rubygems.org/
|
9
|
-
specs:
|
10
|
-
coderay (1.1.0)
|
11
|
-
columnize (0.8.9)
|
12
|
-
ffi (1.9.5-java)
|
13
|
-
method_source (0.8.2)
|
14
|
-
minitest (5.4.2)
|
15
|
-
pry (0.10.1-java)
|
16
|
-
coderay (~> 1.1.0)
|
17
|
-
method_source (~> 0.8.1)
|
18
|
-
slop (~> 3.4)
|
19
|
-
spoon (~> 0.0)
|
20
|
-
rake (10.3.2)
|
21
|
-
ruby-debug (0.10.4)
|
22
|
-
columnize (>= 0.1)
|
23
|
-
ruby-debug-base (~> 0.10.4.0)
|
24
|
-
ruby-debug-base (0.10.4-java)
|
25
|
-
slop (3.6.0)
|
26
|
-
spoon (0.0.4)
|
27
|
-
ffi
|
28
|
-
trollop (2.0)
|
29
|
-
|
30
|
-
PLATFORMS
|
31
|
-
java
|
32
|
-
|
33
|
-
DEPENDENCIES
|
34
|
-
bundler (>= 1.3.4)
|
35
|
-
minitest
|
36
|
-
pry
|
37
|
-
rake
|
38
|
-
ruby-debug
|
39
|
-
tabula-extractor!
|