tabula-extractor 0.7.5-java → 0.7.6-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/lib/tabula/entities/page.rb +5 -1
- data/lib/tabula/entities/ruling.rb +2 -1
- data/lib/tabula/extraction.rb +15 -4
- data/lib/tabula/version.rb +1 -1
- metadata +3 -4
- data/Gemfile.lock +0 -39
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ac7f1daa082acce10e82b94b01b31e07813ad4d
|
4
|
+
data.tar.gz: ac521bbba80d6b0571d904565cd31d9af5e7947a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0389d96e5a7a8ad20c147ed3170b922a501126211bec58b012f39662425599437f3f869002825b562cae57d44645ddb776088804f237e168b71473211a86c67a
|
7
|
+
data.tar.gz: 53dd7bd11684bf8b8ccd03ea9352140eb3dcc346b7018bee7d6b8049e7e70ee02f59b83e360d97ddf5b7f211a1dccbb83694aab8cb016c7a4ba656f46a37c4c4
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
tabula-extractor
|
2
2
|
================
|
3
3
|
|
4
|
-
[![Build Status](https://travis-ci.org/
|
4
|
+
[![Build Status](https://travis-ci.org/tabulapdf/tabula-extractor.png)](https://travis-ci.org/tabulapdf/tabula-extractor)
|
5
5
|
|
6
6
|
Extract tables from PDF files. `tabula-extractor` is the table extraction engine that powers [Tabula](http://tabula.technology), now available as a library and command line program.
|
7
7
|
|
@@ -125,7 +125,7 @@ extractor.extract.each_with_index do |pdf_page, page_index|
|
|
125
125
|
page_areas = [[250, 0, 325, 1700]]
|
126
126
|
|
127
127
|
page_areas.each do |page_area|
|
128
|
-
out << pdf_page.get_area(page_area).
|
128
|
+
out << pdf_page.get_area(page_area).get_table.to_csv
|
129
129
|
out << "\n\n"
|
130
130
|
end
|
131
131
|
|
@@ -155,7 +155,7 @@ extractor.extract.each_with_index do |pdf_page, page_index|
|
|
155
155
|
vertical_rulings = vertical_ruling_locations.map{|n| Tabula::Ruling.new(0, n * scale_factor, 0, 1000)}
|
156
156
|
|
157
157
|
page_areas.each do |page_area|
|
158
|
-
out << pdf_page.get_area(page_area).
|
158
|
+
out << pdf_page.get_area(page_area).get_table(:vertical_rulings => vertical_rulings).to_csv
|
159
159
|
out << "\n\n"
|
160
160
|
end
|
161
161
|
end
|
data/lib/tabula/entities/page.rb
CHANGED
@@ -191,6 +191,10 @@ module Tabula
|
|
191
191
|
end
|
192
192
|
end
|
193
193
|
|
194
|
+
def has_text?
|
195
|
+
!self.texts.empty?
|
196
|
+
end
|
197
|
+
|
194
198
|
# TODO no need for this, let's choose one name
|
195
199
|
def ruling_lines
|
196
200
|
get_ruling_lines!
|
@@ -258,7 +262,7 @@ module Tabula
|
|
258
262
|
:height => self.height,
|
259
263
|
:number => self.number,
|
260
264
|
:rotation => self.rotation,
|
261
|
-
:
|
265
|
+
:hasText => self.has_text?
|
262
266
|
}.to_json(options)
|
263
267
|
end
|
264
268
|
|
data/lib/tabula/entities/ruling.rb
CHANGED
@@ -239,7 +239,8 @@ module Tabula
|
|
239
239
|
end
|
240
240
|
|
241
241
|
def finite?
|
242
|
-
top != ::Float::INFINITY && left != ::Float::INFINITY && bottom != ::Float::INFINITY && right != ::Float::INFINITY
|
242
|
+
top != ::Float::INFINITY && left != ::Float::INFINITY && bottom != ::Float::INFINITY && right != ::Float::INFINITY &&
|
243
|
+
!top.nan? && !left.nan? && !bottom.nan? && !right.nan?
|
243
244
|
end
|
244
245
|
|
245
246
|
##
|
data/lib/tabula/extraction.rb
CHANGED
@@ -371,26 +371,37 @@ module Tabula
|
|
371
371
|
|
372
372
|
|
373
373
|
class PagesInfoExtractor
|
374
|
-
def initialize(
|
375
|
-
@pdf_filename =
|
376
|
-
@pdf_file = Extraction.openPDF(
|
374
|
+
def initialize(pdf_file_path, password='')
|
375
|
+
@pdf_filename = pdf_file_path
|
376
|
+
@pdf_file = Extraction.openPDF(pdf_file_path, password)
|
377
377
|
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
378
|
+
|
379
|
+
@extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
|
378
380
|
end
|
379
381
|
|
380
382
|
def pages
|
383
|
+
found_page_with_texts = false
|
381
384
|
Enumerator.new do |y|
|
382
385
|
begin
|
383
386
|
@all_pages.each_with_index do |page, i|
|
384
387
|
contents = page.getContents
|
385
388
|
|
386
|
-
|
389
|
+
if found_page_with_texts
|
390
|
+
page = Tabula::Page.new(@pdf_filename,
|
387
391
|
page.findCropBox.width,
|
388
392
|
page.findCropBox.height,
|
389
393
|
page.getRotation.to_i,
|
390
394
|
i+1) #remember, these are one-indexed
|
395
|
+
else
|
396
|
+
page = @extractor.extract_page(i+1)
|
397
|
+
found_page_with_texts = page.has_text?
|
398
|
+
end
|
399
|
+
|
400
|
+
y.yield page
|
391
401
|
end
|
392
402
|
ensure
|
393
403
|
@pdf_file.close
|
404
|
+
@extractor.close!
|
394
405
|
end
|
395
406
|
end
|
396
407
|
end
|
data/lib/tabula/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.5
|
4
|
+
version: 0.7.6
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Manuel Aristarán
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2015-01-31 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: bundler
|
@@ -94,7 +94,6 @@ files:
|
|
94
94
|
- .travis.yml
|
95
95
|
- AUTHORS.md
|
96
96
|
- Gemfile
|
97
|
-
- Gemfile.lock
|
98
97
|
- LICENSE.md
|
99
98
|
- NOTICE.txt
|
100
99
|
- README.md
|
@@ -147,7 +146,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
147
146
|
version: '0'
|
148
147
|
requirements: []
|
149
148
|
rubyforge_project:
|
150
|
-
rubygems_version: 2.
|
149
|
+
rubygems_version: 2.1.9
|
151
150
|
signing_key:
|
152
151
|
specification_version: 4
|
153
152
|
summary: extract tables from PDF files
|
data/Gemfile.lock
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
tabula-extractor (0.7.5-java)
|
5
|
-
trollop (~> 2.0)
|
6
|
-
|
7
|
-
GEM
|
8
|
-
remote: http://rubygems.org/
|
9
|
-
specs:
|
10
|
-
coderay (1.1.0)
|
11
|
-
columnize (0.8.9)
|
12
|
-
ffi (1.9.5-java)
|
13
|
-
method_source (0.8.2)
|
14
|
-
minitest (5.4.2)
|
15
|
-
pry (0.10.1-java)
|
16
|
-
coderay (~> 1.1.0)
|
17
|
-
method_source (~> 0.8.1)
|
18
|
-
slop (~> 3.4)
|
19
|
-
spoon (~> 0.0)
|
20
|
-
rake (10.3.2)
|
21
|
-
ruby-debug (0.10.4)
|
22
|
-
columnize (>= 0.1)
|
23
|
-
ruby-debug-base (~> 0.10.4.0)
|
24
|
-
ruby-debug-base (0.10.4-java)
|
25
|
-
slop (3.6.0)
|
26
|
-
spoon (0.0.4)
|
27
|
-
ffi
|
28
|
-
trollop (2.0)
|
29
|
-
|
30
|
-
PLATFORMS
|
31
|
-
java
|
32
|
-
|
33
|
-
DEPENDENCIES
|
34
|
-
bundler (>= 1.3.4)
|
35
|
-
minitest
|
36
|
-
pry
|
37
|
-
rake
|
38
|
-
ruby-debug
|
39
|
-
tabula-extractor!
|