iguvium 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +123 -0
- data/Rakefile +8 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/iguvium +51 -0
- data/iguvium.gemspec +40 -0
- data/lib/iguvium/cv.rb +194 -0
- data/lib/iguvium/image.rb +28 -0
- data/lib/iguvium/labeler.rb +132 -0
- data/lib/iguvium/page.rb +88 -0
- data/lib/iguvium/table.rb +93 -0
- data/lib/iguvium/version.rb +5 -0
- data/lib/iguvium.rb +125 -0
- metadata +163 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 50748a9a17471a1f0f58a108ee632344cafad4c0ee6ab6096dbdb297c53b1381
|
4
|
+
data.tar.gz: 6e676513f8aaee937f2dbb28316452a0dcd38f850afdc31a5ef6685200189814
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 4b0c5fc257ae4e8b667f0611f708e744c806de1f319e016e42df3553273c0cda445fe23f29e76ed8ef980eaf4680c21ecd7dccdf4602b721e5731981eace3097
|
7
|
+
data.tar.gz: 563368859fa3684b8baa54f25e788c91f76d46c71b56bfa271fd9d6db2b9e6f74ad4d5937488b42c5f0912fdcc77e09432c214fde7d5c76dca85884e594912cf
|
data/.gitignore
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
/.bundle/
|
2
|
+
/.yardoc
|
3
|
+
/_yardoc/
|
4
|
+
/coverage/
|
5
|
+
/doc/
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/tmp/
|
9
|
+
.idea/
|
10
|
+
|
11
|
+
Gemfile.lock
|
12
|
+
|
13
|
+
*.gem
|
14
|
+
prof*
|
15
|
+
|
16
|
+
.DS_Store
|
17
|
+
._.DS_Store
|
18
|
+
**/.DS_Store
|
19
|
+
**/._.DS_Store
|
20
|
+
|
21
|
+
# rspec failure tracking
|
22
|
+
.rspec_status
|
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2018 adworse
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
# Iguvium
|
2
|
+
[![Build Status](https://travis-ci.com/adworse/iguvium.svg?token=pKH4s9rC7sLLfFxdq8b6&branch=master)](https://travis-ci.com/adworse/iguvium)
|
3
|
+
|
4
|
+
Iguvium extracts tables from PDF files in a structured form. It works like this.
|
5
|
+
|
6
|
+
Take this PDF file:
|
7
|
+
|
8
|
+
![PDF Table](https://user-images.githubusercontent.com/8277078/48663021-ba81e580-ea92-11e8-8ca6-53c5cd5c7b1b.png)
|
9
|
+
|
10
|
+
Use this code:
|
11
|
+
|
12
|
+
```
|
13
|
+
pages = Iguvium.read('filename.pdf')
|
14
|
+
tables = pages[1].extract_tables!
|
15
|
+
csv = tables.first.to_a.map(&:to_csv).join
|
16
|
+
```
|
17
|
+
|
18
|
+
Get this table:
|
19
|
+
|
20
|
+
![Spreadsheet](https://user-images.githubusercontent.com/8277078/48663073-822ed700-ea93-11e8-8924-9974ab5da27b.png)
|
21
|
+
|
22
|
+
## Features/Limitations:
|
23
|
+
* Iguvium renders pdf into an image, looks for table-like graphic structure and tries to place characters into detected cells.
|
24
|
+
|
25
|
+
* Character extraction is done by the [PDF::Reader gem](https://github.com/yob/pdf-reader). Some PDFs are so messed up that it can't extract meaningful text from them. In that case, neither can Iguvium.
|
26
|
+
|
27
|
+
* The current version extracts regular tables (with a constant number of rows per column and vice versa) with explicit line formatting, like this:
|
28
|
+
|
29
|
+
```
|
30
|
+
.__________________.
|
31
|
+
|____|_______|_____|
|
32
|
+
|____|_______|_____|
|
33
|
+
|____|_______|_____|
|
34
|
+
```
|
35
|
+
Merged cells content is split as if cells were not merged.
|
36
|
+
|
37
|
+
* Performance: considering the fact it has computer vision under the hood, the gem is reasonably fast. Full page extraction takes up to 1 second on modern CPUs and up to 2 seconds on the older ones.
|
38
|
+
|
39
|
+
|
40
|
+
## Installation
|
41
|
+
|
42
|
+
Make sure you have Ghostscript installed.
|
43
|
+
|
44
|
+
Linux: `sudo apt-get install ghostscript`
|
45
|
+
|
46
|
+
Mac: `brew install ghostscript`
|
47
|
+
|
48
|
+
Windows: download installer from the official [download page](https://www.ghostscript.com/download/gsdnld.html).
|
49
|
+
|
50
|
+
Add this line to your application's Gemfile:
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
gem 'iguvium'
|
54
|
+
```
|
55
|
+
|
56
|
+
And then execute:
|
57
|
+
|
58
|
+
$ bundle install
|
59
|
+
|
60
|
+
Or install it yourself as:
|
61
|
+
|
62
|
+
$ gem install iguvium
|
63
|
+
|
64
|
+
## Usage
|
65
|
+
|
66
|
+
#### Get all the tables in 2D text array format
|
67
|
+
```
|
68
|
+
pages = Iguvium.read('filename.pdf') #=> [Array<Iguvium::Page>]
|
69
|
+
tables = pages.flat_map { |page| page.extract_tables! } #=> [Array<Iguvium::Table>]
|
70
|
+
tables.map(&:to_a)
|
71
|
+
```
|
72
|
+
#### Get first table from the page 8
|
73
|
+
```
|
74
|
+
pages = Iguvium.read('filename.pdf')
|
75
|
+
tables = pages[7].extract_tables!
|
76
|
+
tables.first.to_a
|
77
|
+
```
|
78
|
+
|
79
|
+
## CLI
|
80
|
+
|
81
|
+
Gem installation adds a command-line utility to the system. It's a simple wrapper:
|
82
|
+
|
83
|
+
```
|
84
|
+
iguvium filename.pdf [options]
|
85
|
+
-p, --pages page numbers, comma-separated, no spaces
|
86
|
+
-i, --images use pictures in pdf (usually a bad idea)
|
87
|
+
-n, --newlines keep newlines
|
88
|
+
--verbose verbose output
|
89
|
+
```
|
90
|
+
|
91
|
+
Given a filename, it generates CSV files for the tables detected
|
92
|
+
|
93
|
+
## Implementation details
|
94
|
+
There are usually no actual tables in PDFs, only characters with coordinates,
|
95
|
+
and some fancy lines. Human eye interprets this as a table. Iguvium behaves quite similarly. It prints PDF to an image file with GhostScript, then analyses the image.
|
96
|
+
|
97
|
+
Long enough continuous edges are interpreted as possible cell borders. Gaussian blur is applied beforehand to get rid of possible inconsistencies and style features.
|
98
|
+
|
99
|
+
Initially inspired by the [camelot](https://github.com/socialcopsdev/camelot/) idea of using image analysis to detect table structure. Apart from this idea, Iguvium is an independent work. Image recognition is written in Ruby; no OpenCV or other heavy computer vision libraries are used. The line detection algorithms are different too. The functionality of Camelot is significantly broader.
|
100
|
+
|
101
|
+
## Roadmap
|
102
|
+
|
103
|
+
The next version will deal with open-edged tables like
|
104
|
+
|
105
|
+
```
|
106
|
+
__|____|_______|_____|
|
107
|
+
__|____|_______|_____|
|
108
|
+
__|____|_______|_____|
|
109
|
+
```
|
110
|
+
|
111
|
+
It will also keep metadata about open-edged rows ('floorless' and 'roofless') to support merging of multipage tables.
|
112
|
+
|
113
|
+
The final one will recognize tables with merged cells.
|
114
|
+
|
115
|
+
There are at the moment no plans to design recognition of whitespace-separated tables.
|
116
|
+
|
117
|
+
## License
|
118
|
+
|
119
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
120
|
+
|
121
|
+
## Name
|
122
|
+
|
123
|
+
Just a place (ancient) where some [tables](https://en.wikipedia.org/wiki/Iguvine_Tablets) (incredibly cool ones) were found.
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: loads the bundle and the gem, then drops into IRB.
#
# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier. A different console can be used as well, e.g. Pry
# (don't forget to add pry to the Gemfile):
#
#   require "pry"
#   Pry.start

%w[bundler/setup iguvium irb].each { |library| require library }

IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/exe/iguvium
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

# Command-line wrapper around Iguvium: reads a PDF, extracts tables from the
# requested pages (all pages by default) and saves each non-empty table as
# "<pdf name>_page_<N>_table_<I>.csv" next to the given path.

require 'csv'
require 'iguvium'
require 'slop'

cli = Slop.parse do |o|
  o.array '-p', '--pages', 'page numbers, comma-separated, no spaces'
  o.bool '-i', '--images', 'use pictures in pdf (usually a bad idea)'
  o.bool '-n', '--newlines', 'keep newlines'
  o.bool '--verbose', 'verbose output'
  o.on '--version', 'print the version' do
    puts Iguvium::VERSION
    exit
  end
  o.on '-h', '--help', 'show help' do
    puts o.to_s.gsub(/(usage:).+(iguvium)/, '\1 \2 filename.pdf')
    exit
  end
end

path = cli.args.first

abort('No file name given') unless path

opts = cli.to_hash
requested_pages = opts.delete(:pages)

# Iguvium.read takes keyword options; an options hash passed positionally is
# rejected by Ruby 3, so it has to be splatted explicitly.
pages = Iguvium.read(path, **opts)

Iguvium.logger.level = Logger::INFO if opts[:verbose]

# Page numbers are 1-based on the command line and 0-based internally.
# Reject anything that does not address an existing page: a bare `to_i - 1`
# would silently turn "0" or junk into index -1, i.e. the last page.
page_numbers = requested_pages.map do |number|
  index = number.to_i - 1
  abort("Invalid page number: #{number}") unless (0...pages.count).cover?(index)
  index
end
page_numbers = pages.count.times.to_a if page_numbers.empty?

page_numbers.each do |number|
  print "Extracting page #{number + 1}... "
  tables = pages[number].extract_tables!(images: opts[:images])
  tables.empty? ? puts('no tables found') : puts

  tables.each_with_index do |table, i|
    csv = table.to_a(newlines: opts[:newlines]).map(&:to_csv).join
    next if csv.empty?

    csv_file = "#{path.sub(/\.pdf\z/i, '')}_page_#{number + 1}_table_#{i}.csv"
    # Report the location the file is really written to (relative paths are
    # resolved against the working directory, not the executable's directory).
    puts "Saving #{File.expand_path(csv_file)}"
    File.write csv_file, csv
  end
end
|
data/iguvium.gemspec
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true

lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'iguvium/version'

Gem::Specification.new do |spec|
  spec.name          = 'iguvium'
  spec.version       = Iguvium::VERSION
  spec.authors       = ['Dima Ermilov']
  spec.email         = ['dima@scriptangle.com']

  spec.summary       = 'Extract tables from PDF as a structured info'
  # Adjacent literals are concatenated at parse time. A trailing backslash
  # inside a single-quoted string is NOT a line continuation: it would put a
  # literal backslash and a newline into the published description.
  spec.description   = 'Extract tables from PDF as a structured info. ' \
                       'Uses ghostscript to print pdf to image, ' \
                       'then recognizes table separators optically. ' \
                       'No OpenCV or other heavy dependencies'
  spec.homepage      = 'https://github.com/adworse/iguvium'
  spec.license       = 'MIT'

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end

  spec.bindir        = 'exe'
  spec.executables   = ['iguvium']
  spec.require_paths = ['lib']

  spec.add_dependency 'pdf-reader', '~> 2.1'
  spec.add_dependency 'convolver-light', '~> 0.3.1'
  spec.add_dependency 'oily_png', '~> 1.2'
  spec.add_dependency 'slop', '~> 4.2'

  spec.add_development_dependency 'bundler', '~> 1.16'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec', '~> 3.0'
end
|
data/lib/iguvium/cv.rb
ADDED
@@ -0,0 +1,194 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Iguvium
|
4
|
+
# 5x5 smoothing kernel used by CV#blur; its weights add up to ~1.
GAUSS = NArray[
  [0.0125786, 0.0251572, 0.0314465, 0.0251572, 0.0125786],
  [0.0251572, 0.0566038, 0.0754717, 0.0566038, 0.0251572],
  [0.0314465, 0.0754717, 0.0943396, 0.0754717, 0.0314465],
  [0.0251572, 0.0566038, 0.0754717, 0.0566038, 0.0251572],
  [0.0125786, 0.0251572, 0.0314465, 0.0251572, 0.0125786]
]

# 3x3 kernel used by CV#horizontals: rewards a marked row between two unmarked ones.
HORIZONTAL = NArray[
  [-1, -1, -1],
  [2, 2, 2],
  [-1, -1, -1]
]

# 3x3 kernel used by CV#verticals: rewards a marked column between two unmarked ones.
VERTICAL = NArray[
  [-1, 2, -1],
  [-1, 2, -1],
  [-1, 2, -1]
]

private_constant :GAUSS, :HORIZONTAL, :VERTICAL
|
25
|
+
|
26
|
+
# Performs all the computer vision work except table composition.
# Edge detection is performed using a simplified two-directional version of the
# Canny edge detection operator applied to rows and columns as integer vectors.
class CV
  # @!attribute lines
  #   @return [Hash] :vertical and :horizontal lines. Horizontal lines are [Array<Range, Integer>],
  #     vertical have [Integer] on x position
  # @!attribute boxes
  #   @return [Array<Range, Range>] X range, Y range

  # Keeps recognized data
  Recognized = Struct.new(:lines, :boxes)

  # Prepares image for recognition: initial blur
  # @param image [ChunkyPNG::Image] from {Iguvium::Image.read}
  def initialize(image)
    @image = blur image
  end

  # @return [Array] 8-bit representation of an image
  attr_reader :image

  # @return [Recognized]
  #   lines most probably forming table cells and tables' outer borders as boxes
  def recognize
    Recognized.new(lines, boxes)
  end

  private

  # Detected lines with the y axis flipped (see #flip_y), memoized.
  # Vertical lines are sorted by the beginning of their y range, then by x;
  # horizontal lines are sorted by y.
  def lines
    @lines ||=
      {
        vertical: Labeler.new(verticals)
                         .lines
                         .map { |line| flip_line line }
                         .sort_by { |x, yrange| [yrange.begin, x] },
        horizontal: Labeler.new(horizontals)
                           .lines
                           .map { |line| flip_line line }
                           .sort_by { |_, y| [y] }
      }
  end

  # Bounding boxes of every connected group of pixels darker than the
  # brightest value of the blurred image, sorted by the beginning of the
  # y range, then of the x range. Memoized.
  def boxes
    return @boxes if @boxes

    brightest = image.flatten.max
    binarized = image.map { |row| row.map { |pix| pix < brightest } }
    @boxes = Labeler.new(binarized)
                    .clusters
                    .map { |cluster| box cluster }
                    .sort_by { |xrange, yrange| [yrange.begin, xrange.begin] }
  end

  # Row-wise edge candidates filtered with the VERTICAL kernel.
  def verticals(threshold = 3)
    threshold_edges(horizontal_scan(image), VERTICAL, threshold)
  end

  # Column-wise edge candidates filtered with the HORIZONTAL kernel.
  def horizontals(threshold = 3)
    threshold_edges(vertical_scan(image), HORIZONTAL, threshold)
  end

  # Convolves an edge scan with the given kernel (zero-padded border) and
  # replaces every response below the threshold with nil.
  def threshold_edges(scan, kernel, threshold)
    Matrix
      .rows(convolve(NArray[*scan], kernel, 0).to_a)
      .map { |pix| pix < threshold ? nil : pix }
      .to_a
  end

  # Mirrors a row index vertically: row 0 becomes the last row and vice versa.
  def flip_y(coord)
    @height ||= image.count
    @height - coord - 1
  end

  # Mirrors a range of row indices, keeping begin <= end.
  def flip_range(range)
    flip_y(range.end)..flip_y(range.begin)
  end

  # Mirrors the second (y) coordinate of a line, which is either a single
  # row index or a range of row indices.
  # @raise [ArgumentError] when the second coordinate is neither of those
  def flip_line(line)
    y = line.last
    flipped =
      case y
      when Numeric then flip_y(y)
      when Range then flip_range(y)
      else
        raise ArgumentError,
              "cannot flip line #{line.inspect}: the last coordinate must be a Numeric or a Range"
      end

    [line.first, flipped]
  end

  # Converts the image to grayscale values and smooths it with the GAUSS kernel.
  def blur(image)
    convolve(to_narray(image), GAUSS).to_a
  end

  # Pads the array by half of the kernel size, so the convolution result
  # keeps the size of the input, then convolves and rounds the result up.
  def convolve(narray, conv, border_value = 255)
    narray = border(narray, conv.shape.first / 2, border_value)
    Convolver.convolve(narray, conv).ceil
  end

  # Surrounds a 2D array with a frame of `value`, `width` cells thick.
  # Every added row is a separate object, so the result can be mutated safely.
  # An empty array yields just the top and bottom frame rows.
  def array_border(array, width, value = 0)
    columns = (array.first || []).count + width * 2
    frame_rows = -> { Array.new(width) { Array.new(columns, value) } }
    side = [value] * width

    frame_rows.call + array.map { |row| side + row + side } + frame_rows.call
  end

  # Same as #array_border, for NArray input and output.
  def border(narray, width, value = 0)
    NArray[*array_border(narray.to_a, width, value)]
  end

  # Converts the image to an NArray of grayscale values, each pixel being
  # composed over white first.
  def to_narray(image)
    palette = image.pixels.uniq
    # Precalculation looks stupid but spares up to 0.35 seconds on calculation depending on colorspace width
    dict = palette.zip(
      palette.map { |color| ChunkyPNG::Color.grayscale_teint ChunkyPNG::Color.compose(color, 0xffffffff) }
    ).to_h
    NArray[
      image.pixels.map { |color| dict[color] }
    ].reshape(image.width, image.height)
  end

  # Indices of local minimums of a vector. When a minimum is a plateau of
  # equal values, the index recorded is the one right after the last strict
  # descent (0 if there was none).
  #
  # The manual loop is deliberate: it takes ~200 ms per page scan to run
  # vs ~700 ms for a prettier each_cons/slice_when based version.
  def minimums(ary)
    i = 0
    mins = []
    local = 0
    while i + 2 < ary.length
      local = i + 1 if ary[i] > ary[i + 1]
      mins << local if ary[i] >= ary[i + 1] && ary[i + 1] < ary[i + 2]
      i += 1
    end
    mins.uniq
  end

  # Array of the vector's size holding 1 at every local minimum and nil elsewhere.
  def edges(vector)
    Array
      .new(vector.count)
      .tap { |ary| minimums(vector).each { |i| ary[i] = 1 } }
  end

  # Marks local minimums along every row.
  def horizontal_scan(image)
    image.map { |row| edges row }
  end

  # Marks local minimums along every column.
  def vertical_scan(image)
    image.transpose.map { |row| edges row }.transpose
  end

  # Bounding box of a cluster of [row, column] coordinates.
  # @return [Array<Range, Range>] X range and vertically flipped Y range
  def box(coord_array)
    ax, bx = coord_array.map(&:last).minmax
    ay, by = coord_array.map(&:first).minmax
    [ax..bx, flip_range(ay..by)]
  end
end
|
194
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Iguvium
  # PDF to image converter
  class Image
    # Prints a single page without text to a temporary image file with
    # GhostScript and reads it back to memory. GhostScript output is logged
    # at the info level.
    #
    # The temporary file is created next to the PDF and is always removed,
    # even when rendering or decoding fails. Its name never coincides with
    # +path+: deriving it by merely swapping a `.pdf` extension would, for
    # `file.PDF` or a file without that extension, point GhostScript's output
    # at the input document and then delete that document.
    #
    # GhostScript is started without a shell, so paths containing spaces or
    # shell metacharacters are passed through verbatim.
    #
    # @param path [String] path to PDF file to be read
    # @param pagenumber [Integer] number of page, first page is 1, not 0
    #
    # @option opts [Boolean] :images (false) consider pictures in PDF as possible table separators
    # @option opts [String] :gspath (nil) explicit path to the GhostScript executable. Use it in case of
    #   non-standard gs executable placement. Plain `gs` is used when it is not given.
    #
    # @return [ChunkyPNG::Image]
    def self.read(path, pagenumber = 1, **opts)
      output = "#{path.sub(/\.pdf\z/i, '')}_page_#{pagenumber}_#{Process.pid}.rgb"
      command = [
        executable(opts[:gspath]),
        '-dSAFER', '-dBATCH', '-dNOPAUSE', '-sDEVICE=pnggray', '-dGraphicsAlphaBits=4',
        '-r72', "-dFirstPage=#{pagenumber}", "-dLastPage=#{pagenumber}",
        '-dFILTERTEXT', ('-dFILTERIMAGE' unless opts[:images]),
        "-sOutputFile=#{output}", path
      ].compact

      # stderr is merged into stdout, so everything GhostScript prints is logged
      Iguvium.logger.info IO.popen(command, err: %i[child out], &:read)

      ChunkyPNG::Image.from_file(output)
    ensure
      File.delete(output) if output && File.exist?(output)
    end

    # GhostScript executable to run. Callers may hand over a path that is
    # already wrapped in double quotes for shell use; the quotes are stripped
    # because no shell is involved here.
    def self.executable(gspath)
      (gspath || 'gs').to_s.gsub(/\A"|"\z/, '')
    end
    private_class_method :executable
  end
end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Iguvium
  # Offsets ([row, column]) of the neighbors visited before the current pixel
  # in a top-to-bottom, left-to-right scan: left, upper left, upper, upper right.
  NEIGHBORS = [[0, -1], [-1, -1], [-1, 0], [-1, 1]].freeze
  # A cluster is treated as a line if the share of distinct coordinates
  # along one of its axes is below this ratio.
  FLAT_THRESHOLD = 0.2

  private_constant :NEIGHBORS, :FLAT_THRESHOLD

  # Clusterizes connected pixels using two-pass connected component labelling algorithm (Hoshen-Kopelman),
  # 8-connectivity is used. Line-like groups are then flattened using simplified dispersion ratio
  class Labeler
    # @param image [Array<Array>] binarized image as an array of rows,
    #   w/ falsy elements as background and anything truthy as pixels
    #
    def initialize(image)
      @image = image
      rows = image.count
      cols = (image.first || []).count
      @labels = Array.new(rows) { Array.new(cols) }
      # @equalities[label] is the parent of the label in a union-find forest;
      # a label which is its own parent is the root of its cluster
      @equalities = []
      @labeled = false
    end

    # @return [Array] clusters flattened to lines: `[x, ymin..ymax]` for vertical ones,
    #   `[xmin..xmax, y]` for horizontal ones. Clusters which are not line-like are
    #   logged as warnings and skipped
    def lines
      clusters.map { |cluster| flatten_cluster cluster }.compact
    end

    # @return [Array<Array>] [row, column] coordinates of connected pixels grouped together,
    #   groups are ordered by their first pixel in scan order
    def clusters
      accumulator = Hash.new { |h, k| h[k] = [] }
      label.each_with_index do |row_labels, row|
        row_labels.each_with_index do |pix, column|
          accumulator[pix] << [row, column] if pix
        end
      end
      accumulator.values
    end

    private

    attr_reader :image, :labels

    # Runs both labelling passes once and returns the matrix of final labels.
    def label
      return @labels if @labeled

      pass_one
      roots = @equalities.each_index.map { |provisional| find_root provisional }
      # second pass: replace every provisional label with the root of its set
      @labels.each { |row| row.map! { |provisional| provisional && roots[provisional] } }
      @labeled = true
      @labels
    end

    def flatten_cluster(cluster)
      xs = cluster.map(&:last)
      ys = cluster.map(&:first)

      if flat?(xs)
        [most_frequent(xs), ys.min..ys.max]
      elsif flat?(ys)
        [xs.min..xs.max, most_frequent(ys)]
      else
        Iguvium.logger.warn "NonFlattable, #{cluster.inspect}"
        nil
      end
    end

    # True if the coordinates are concentrated in few distinct values.
    def flat?(coords)
      coords.uniq.count / coords.count.to_f < FLAT_THRESHOLD
    end

    # Most frequent value; the earliest one wins a tie. Counts are collected
    # in a single pass instead of rescanning the array for every element.
    def most_frequent(values)
      counts = Hash.new(0)
      values.each { |value| counts[value] += 1 }
      values.max_by { |value| counts[value] }
    end

    # Root of the set the label belongs to (iterative, so long chains of
    # equivalences can't exhaust the stack).
    def find_root(label)
      label = @equalities[label] until @equalities[label] == label
      label
    end

    # Records that two labels mark the same cluster. Roots are linked rather
    # than the labels themselves: reassigning a label's parent directly would
    # discard equivalences recorded for it earlier and split the cluster.
    def union(first, second)
      low, high = [find_root(first), find_root(second)].minmax
      @equalities[high] = low unless low == high
    end

    # Labels already assigned to the neighbors of a pixel.
    def neighbors(row, col)
      NEIGHBORS.map { |roffset, coffset|
        r = row + roffset
        c = col + coffset
        # negative indices would wrap around to the opposite image edge
        labels.dig(r, c) unless [r, c].min < 0
      }.compact
    end

    # First pass: gives every pixel a provisional label and records which
    # provisional labels turned out to be connected.
    def pass_one
      next_label = 0
      image.each_index do |row|
        image[row].each_index do |column|
          next unless image[row][column]

          adjacent = neighbors(row, column).uniq

          if adjacent.empty?
            @equalities[next_label] = next_label
            @labels[row][column] = next_label
            next_label += 1
          else
            smallest = adjacent.min
            @labels[row][column] = smallest
            adjacent.each { |other| union(smallest, other) }
          end
        end
      end
      self
    end
  end
end
|
data/lib/iguvium/page.rb
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Iguvium
  # A document page you can extract tables from. To do so, use {Iguvium::Page#extract_tables!}.
  #
  # {Iguvium::Page#text} method is handy in order to pre-analyze whether you need this page.
  #
  # @example
  #   pages = Iguvium.read('nixon.pdf', gspath: '/usr/bin/gs')
  #   pages = pages.select { |page| page.text.match?(/[Tt]able.+\d+/) }
  #   tables = pages.map(&:extract_tables!)
  class Page
    # @param page [PDF::Reader::Page]
    # @param (see Iguvium.read)
    # Typically you don't need it, prefer {Page} creation from {Iguvium.read}
    def initialize(page, path, **opts)
      @opts = opts
      @reader_page = page
      @path = path
    end

    # @!visibility private
    # @return (see Iguvium::CV#lines)
    attr_reader :lines

    # This method does all the heavy lifting, which includes optical recognition of table borders.
    # It returns an array of {Iguvium::Table}
    # or an empty array if it fails to recognize any. To get structured data from parsed {Iguvium::Table},
    # just call {Iguvium::Table#to_a}.
    #
    # The result is cached; recognition is repeated only if the method is called again
    # with a different `images` setting.
    #
    # @todo Further speed improvements should be done, expecting at least 30% speedup on multicore systems
    #
    # Due to the nature of PDF document which is generally a collection of independent pages,
    # {Iguvium::Page#extract_tables!} is suitable for parallel processing. Concurrent processing
    # (think fork as parallel vs. thread as concurrent) on the other hand would be not a great idea,
    # because it's a CPU-intensive task.
    #
    # On some older CPUs it takes up to 2 seconds per page for it to work
    # (up to 1 second on more modern ones), so use it wisely.
    #
    # @example extract tables using pictures as possible borders
    #   tables = page.extract_tables! images: true #=> [Array<Iguvium::Table>]
    # @return [Array<Iguvium::Table>]
    def extract_tables!(images: @opts[:images])
      # compare truthiness: nil and false both mean "ignore pictures"
      return @tables if @tables && !@opts[:images] == !images

      @opts[:images] = images
      recognize!
      @tables
    end

    # @return [String] rendered page text, result of underlying PDF::Reader::Page#text call
    # It takes ~150 ms for it to work, so it's handy
    # for picking up pages before trying to extract tables, which is an expensive operation
    def text
      @text ||= @reader_page.text
    end

    # @!visibility private
    # @return [Array<PDF::Reader::TextRun>] array of characters on page. Each character has its coordinates,
    # size, and width
    def characters
      return @characters if @characters

      # NOTE(review): this reaches into non-public parts of PDF::Reader
      # (the `walk` method is invoked via `send`, `@characters` is read directly)
      # and may break when the pdf-reader gem is upgraded.
      receiver = PDF::Reader::PageTextReceiver.new
      @reader_page.send(:walk, receiver)
      @characters = receiver.instance_variable_get('@characters')
    end

    private

    # Renders the page to an image, detects lines and boxes on it and builds
    # a table for every box that has at least one character inside.
    def recognize!
      # Image.read takes keyword options: a hash passed positionally is
      # rejected by Ruby 3, so it is splatted explicitly
      image = Image.read(@path, @reader_page.number, **@opts)
      recognized = CV.new(image).recognize
      @lines = recognized[:lines]
      @boxes = recognized[:boxes].reject { |box| box_empty?(box) }
      @tables = @boxes.map { |box| Table.new(box, self) }.reverse
      self
    end

    # @param box [Array<Range, Range>] X range, Y range
    # @return [Boolean] true if there are no characters inside the box
    def box_empty?(box)
      characters.none? { |character| box.first.cover?(character.x) && box.last.cover?(character.y) }
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Iguvium
  # Represents single table from the [Iguvium::Page]:
  # * table outer borders aka box,
  # * set of detected horizontal and vertical lines to form table's grid,
  # * set of characters with its coordinates to fill the grid.
  #
  # Additional functionality like an option to detect an open table grid at the end or at the beginning
  # of the page will be added later
  #
  # To render table into 2D text array, call {#to_a}
  class Table
    # @api private
    # @param box [Array<Range, Range>] X range and Y range of the table's outer borders
    # @param page [Iguvium::Page] page providing `lines` and `characters`
    def initialize(box, page)
      @box = box
      @lines = page.lines
      @page = page
    end

    # Renders the table into an array of strings.
    #
    # Newlines in PDF have usually no semantic value, and are replaced with spaces by default.
    # Sometimes you may need to keep them; in this case use `newlines: true` option.
    #
    # @param [Boolean] newlines keep newlines inside table cells, false by default
    # @return [Array] 2D array of strings (content of table's cells)
    #
    def to_a(newlines: false)
      # rows are sorted by ascending y; they are emitted in reverse order
      grid[:rows].reverse.map do |row|
        grid[:columns].map { |column| render(chars_inside(column, row), newlines: newlines) }
      end
    end

    private

    attr_reader :page, :lines, :box

    def enhancer(grid)
      # @todo write grid enhancer to detect cells between outer grid lines and box borders
    end

    # Page characters located inside the table's box, memoized.
    def characters
      @characters ||= begin
        xrange, yrange = box.first, box.last
        page.characters.select { |character| xrange.cover?(character.x) && yrange.cover?(character.y) }
      end
    end

    # @return [Hash] :rows and :columns, each an array of end-exclusive ranges
    #   between adjacent grid lines
    def grid
      @grid ||=
        {
          rows: lines_to_ranges(lines[:horizontal]),
          columns: lines_to_ranges(lines[:vertical])
        }
    end

    # Takes the fixed coordinate of every line lying inside the box and turns
    # the sorted unique coordinates into ranges between neighbors.
    def lines_to_ranges(lines)
      lines
        .select { |line| line_in_box?(line, box) }
        .map { |line| line.first.is_a?(Numeric) ? line.first : line.last }
        .sort
        .uniq
        .each_cons(2)
        .map { |a, b| a...b }
    end

    # True if both ends of the line are inside the box.
    def line_in_box?(line, box)
      # only the extremes of a ranged coordinate matter, no need to expand the range
      xs, ys = line.map { |coord| coord.is_a?(Range) ? [coord.min, coord.max] : [coord] }
      xs.all? { |coord| box.first.cover?(coord) } && ys.all? { |coord| box.last.cover?(coord) }
    end

    def chars_inside(xrange, yrange)
      characters.select { |character| xrange.cover?(character.x) && yrange.cover?(character.y) }
    end

    # Merges characters into text: adjacent mergable characters form chunks,
    # chunks are joined with a space (or a newline, if `newlines` is set).
    def render(characters, newlines: false)
      separator = newlines ? "\n" : ' '
      characters
        .sort
        .chunk_while { |a, b| a.mergable?(b) }
        # Whitespace and Unicode separators are collapsed to a single space.
        # `|` must not appear in the character class: it is a literal there,
        # not alternation, and would erase pipe characters from cell text.
        .map { |chunk| chunk.inject(:+).to_s.strip.gsub(/[\s\p{Z}]+/, ' ') }
        .join(separator)
        .gsub(/ +/, ' ')
    end
  end
end
|
data/lib/iguvium.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'convolver-light'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'logger'
|
6
|
+
require 'matrix'
|
7
|
+
require 'oily_png'
|
8
|
+
require 'pdf-reader'
|
9
|
+
require 'rbconfig'
|
10
|
+
|
11
|
+
require_relative 'iguvium/labeler'
|
12
|
+
require_relative 'iguvium/cv'
|
13
|
+
require_relative 'iguvium/image'
|
14
|
+
require_relative 'iguvium/page'
|
15
|
+
require_relative 'iguvium/table'
|
16
|
+
require_relative 'iguvium/version'
|
17
|
+
|
18
|
+
# PDF tables extractor.
|
19
|
+
# @example Get all the tables in 2D text array format
|
20
|
+
# pages = Iguvium.read('filename.pdf') #=> [Array<Iguvium::Page>]
|
21
|
+
# tables = pages.flat_map { |page| page.extract_tables! } #=> [Array<Iguvium::Table>]
|
22
|
+
# tables.map(&:to_a)
|
23
|
+
# @example Get first table from the page 8
|
24
|
+
# pages = Iguvium.read('filename.pdf')
|
25
|
+
# tables = pages[7].extract_tables!
|
26
|
+
# tables.first.to_a
|
27
|
+
# For more details please look {Iguvium.read} and {Iguvium::Page#extract_tables!}
|
28
|
+
# @author Dima Ermilov <wlaer@wlaer.com>
|
29
|
+
#
|
30
|
+
module Iguvium
  class << self
    # This is the main method. Usually this is where you start.
    #
    # It returns an array of {Iguvium::Page}.
    #
    # Tables on those pages are neither extracted nor detected yet,
    # all the heavy lifting is done in {Iguvium::Page#extract_tables!} method.
    #
    # If no GhostScript executable can be located, a hint is printed to STDOUT
    # and the process exits.
    #
    # @param path [String] path to PDF file to be read
    # @option opts [String] :gspath (nil) explicit path to the GhostScript executable. Use it in case of
    #   non-standard gs executable placement. If not specified, gem tries standard options
    #   like `C:\\Program Files\\gs\\gs*\\bin\\gswin??c.exe` on Windows or just `gs` on Mac and Linux
    # @option opts [Logger::Level] :loglevel level like Logger::INFO, default is Logger::ERROR
    # @return [Array <Iguvium::Page>]
    #
    # @example prepare pages, consider images meaningful
    #   pages = Iguvium.read('filename.pdf', images: true)
    #
    # @example set nonstandard gs path, get pages starting with the one which contains keyword
    #   pages = Iguvium.read('nixon.pdf', gspath: '/usr/bin/gs')
    #   pages = pages.drop_while { |page| !page.text.match?(/Watergate/) }
    #   # {Iguvium::Page#text} does not require optical page scan and thus is relatively cheap.
    #   # It uses an underlying PDF::Reader::Page#text which is fast but not completely free though.
    #
    # @option opts [Boolean] :images (false) consider pictures in PDF as possible table separators.
    #   This typically makes sense in a rare case when table grid in your pdf is filled with
    #   rasterized texture or is actually a background picture. Usually you don't want to use it.
    #
    def read(path, **opts)
      if windows?
        # gs_windows returns an empty string (never nil) when nothing is installed,
        # so the check below is reached instead of crashing on a nil glob result.
        opts[:gspath] ||= gs_windows

        if opts[:gspath].empty?
          puts "There's no gs utility in your $PATH.\n" \
               'Please install GhostScript: https://www.ghostscript.com/download/gsdnld.html'
          exit
        end
      else
        opts[:gspath] ||= gs_nix?
      end

      PDF::Reader.new(path, opts).pages.map { |page| Page.new(page, path, opts) }
    end

    # Creates and gives access to Ruby Logger. Default [Logger::Level] is Logger::ERROR.
    #
    # To set another level call `Iguvium.logger.level = Logger::INFO` or some other standard logger level
    #
    # It is possible to redefine Iguvium's logger, for example to replace it with a global one like
    # `Iguvium.logger = Rails.logger`
    # @return [Logger]
    def logger
      @logger ||= Logger.new(STDOUT).tap do |log|
        # Compact output: severity and message only, no timestamp or program name.
        log.formatter = proc { |severity, _, _, msg| "#{severity}: #{msg}\n" }
        log.level = Logger::ERROR
      end
    end

    # Replaces the logger used by the gem.
    # @param new_logger [Logger] logger to be returned by {Iguvium.logger} from now on
    attr_writer :logger

    private

    # Looks for a GhostScript console executable in its standard Windows location.
    # @return [String] double-quoted path with backslash separators,
    #   or an empty string if no executable is found
    def gs_windows
      found = Dir.glob('C:/Program Files/gs/gs*/bin/gswin??c.exe').first
      return '' unless found

      "\"#{found.tr('/', '\\')}\""
    end

    # Makes sure `gs` is reachable through $PATH; prints a hint and exits otherwise.
    # @return [String] name of the GhostScript executable
    def gs_nix?
      if `which gs`.empty?
        puts "There's no gs utility in your $PATH.\n" \
             "Please install GhostScript with `brew install ghostscript` on Mac\n" \
             'or download it here: https://www.ghostscript.com/download/gsdnld.html'
        exit
      end
      'gs'
    end

    # Tells whether the host OS is Windows.
    # @return [MatchData, nil] truthy on mswin, mingw and cygwin hosts
    def windows?
      RbConfig::CONFIG['host_os'].match(/mswin|mingw|cygwin/)
    end
  end
end
|
117
|
+
|
118
|
+
# TODO: 4) Add options like maybe image thresholding
|
119
|
+
#
|
120
|
+
# TODO: 6) 0.9 - version capable of reading tables with open outer cells, like this:
|
121
|
+
# __|____|_______|_____|
|
122
|
+
# __|____|_______|_____|
|
123
|
+
# __|____|_______|_____|
|
124
|
+
#
|
125
|
+
# TODO: 7) 1.0 - in addition it should deal with merged cells (move result to the upper left cell).
|
metadata
ADDED
@@ -0,0 +1,163 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: iguvium
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.8.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dima Ermilov
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-11-19 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: pdf-reader
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.1'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.1'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: convolver-light
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.3.1
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.3.1
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: oily_png
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.2'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.2'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: slop
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '4.2'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '4.2'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: bundler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.16'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.16'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rake
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '10.0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '10.0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '3.0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '3.0'
|
111
|
+
description: |-
|
112
|
+
Extract tables from PDF as a structured info. Uses ghostscript to print pdf to image, \
|
113
|
+
then recognizes table separators optically. No OpenCV or other heavy dependencies
|
114
|
+
email:
|
115
|
+
- dima@scriptangle.com
|
116
|
+
executables:
|
117
|
+
- iguvium
|
118
|
+
extensions: []
|
119
|
+
extra_rdoc_files: []
|
120
|
+
files:
|
121
|
+
- ".gitignore"
|
122
|
+
- ".rspec"
|
123
|
+
- ".travis.yml"
|
124
|
+
- Gemfile
|
125
|
+
- LICENSE.txt
|
126
|
+
- README.md
|
127
|
+
- Rakefile
|
128
|
+
- bin/console
|
129
|
+
- bin/setup
|
130
|
+
- exe/iguvium
|
131
|
+
- iguvium.gemspec
|
132
|
+
- lib/iguvium.rb
|
133
|
+
- lib/iguvium/cv.rb
|
134
|
+
- lib/iguvium/image.rb
|
135
|
+
- lib/iguvium/labeler.rb
|
136
|
+
- lib/iguvium/page.rb
|
137
|
+
- lib/iguvium/table.rb
|
138
|
+
- lib/iguvium/version.rb
|
139
|
+
homepage: https://github.com/adworse/iguvium
|
140
|
+
licenses:
|
141
|
+
- MIT
|
142
|
+
metadata: {}
|
143
|
+
post_install_message:
|
144
|
+
rdoc_options: []
|
145
|
+
require_paths:
|
146
|
+
- lib
|
147
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - ">="
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '0'
|
152
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
153
|
+
requirements:
|
154
|
+
- - ">="
|
155
|
+
- !ruby/object:Gem::Version
|
156
|
+
version: '0'
|
157
|
+
requirements: []
|
158
|
+
rubyforge_project:
|
159
|
+
rubygems_version: 2.7.6
|
160
|
+
signing_key:
|
161
|
+
specification_version: 4
|
162
|
+
summary: Extract tables from PDF as a structured info
|
163
|
+
test_files: []
|