iguvium 0.8.2 → 0.8.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/exe/iguvium +2 -1
- data/lib/iguvium/table.rb +26 -2
- data/lib/iguvium/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a70226f57667319ee056c909d177e04e816721aee9706f8ba6351e592cc4c209
|
4
|
+
data.tar.gz: dc3a3ccff9aee45b27224656a9a7e6b2a4d758ee0220697b145bf6ae3918e747
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 59aa03ad81d1c9c085fb0dd3b66a48e88d44a4a8b516e83f0160a47de8bde32ae58769efa2f700276779695e9286c4eaaa6a76db432adeb1df4ac5f0be8d90fb
|
7
|
+
data.tar.gz: 58e8f45ca66aa7841207c50597d400a618e38aafa091608fd4a4db7cb7406c0473d884320e08d4cc67b26a92ba5d7c64e6aa8ef5ab1bcf4f00d9ba730eb2529a
|
data/README.md
CHANGED
@@ -89,6 +89,7 @@ iguvium filename.pdf [options]
|
|
89
89
|
-p, --pages page numbers, comma-separated, no spaces
|
90
90
|
-i, --images use pictures in pdf (usually a bad idea)
|
91
91
|
-n, --newlines keep newlines
|
92
|
+
--phrases keep phrases unsplit, could fix some merged cells
|
92
93
|
-t, --text extract full page text instead of tables
|
93
94
|
--verbose verbose output
|
94
95
|
```
|
data/exe/iguvium
CHANGED
@@ -9,6 +9,7 @@ opts = Slop.parse { |o|
|
|
9
9
|
o.array '-p', '--pages', 'page numbers, comma-separated, no spaces'
|
10
10
|
o.bool '-i', '--images', 'use pictures in pdf (usually a bad idea)'
|
11
11
|
o.bool '-n', '--newlines', 'keep newlines'
|
12
|
+
o.bool '--phrases', 'keep phrases unsplit, could fix some merged cells'
|
12
13
|
o.bool '-t', '--text', 'extract full page text instead of tables'
|
13
14
|
o.bool '--verbose', 'verbose output'
|
14
15
|
o.on '--version', 'print the version' do
|
@@ -54,7 +55,7 @@ page_numbers.each do |number|
|
|
54
55
|
tables.empty? ? puts('no tables found') : puts
|
55
56
|
|
56
57
|
tables.each_with_index do |table, i|
|
57
|
-
csv = table.to_a(newlines: opts[:newlines]).map(&:to_csv).join
|
58
|
+
csv = table.to_a(newlines: opts[:newlines], phrases: opts[:phrases]).map(&:to_csv).join
|
58
59
|
next if csv.empty?
|
59
60
|
csv_file = "#{path.gsub(/\.pdf$/, '')}_page_#{number + 1}_table_#{i}.csv"
|
60
61
|
puts "Saving #{File.expand_path csv_file}"
|
data/lib/iguvium/table.rb
CHANGED
@@ -24,12 +24,22 @@ module Iguvium
|
|
24
24
|
# Sometimes you may need to keep them; in this case use `newlines: true` option.
|
25
25
|
#
|
26
26
|
# @param [Boolean] newlines keep newlines inside table cells, false by default
|
27
|
+
# @param [Boolean] phrases keep phrases unsplit, false by default.
|
28
|
+
# Poor man's merged cells workaround. Could break some tables, could fix some.
|
29
|
+
#
|
27
30
|
# @return [Array] 2D array of strings (content of table's cells)
|
28
31
|
#
|
29
|
-
def to_a(newlines: false)
|
32
|
+
def to_a(newlines: false, phrases: false)
|
30
33
|
grid[:rows]
|
31
34
|
.reverse
|
32
|
-
.map { |row|
|
35
|
+
.map { |row|
|
36
|
+
grid[:columns].map do |column|
|
37
|
+
render(
|
38
|
+
phrases ? words_inside(column, row) : chars_inside(column, row),
|
39
|
+
newlines: newlines
|
40
|
+
)
|
41
|
+
end
|
42
|
+
}
|
33
43
|
end
|
34
44
|
|
35
45
|
private
|
@@ -49,6 +59,20 @@ module Iguvium
|
|
49
59
|
.select { |character| xrange.cover?(character.x) && yrange.cover?(character.y) }
|
50
60
|
end
|
51
61
|
|
62
|
+
def words
|
63
|
+
@words ||=
|
64
|
+
characters
|
65
|
+
.sort
|
66
|
+
.chunk_while { |a, b| a.mergable?(b) }
|
67
|
+
.map { |chunk| chunk.inject(:+) }
|
68
|
+
end
|
69
|
+
|
70
|
+
def words_inside(xrange, yrange)
|
71
|
+
words.select { |character|
|
72
|
+
xrange.cover?(character.x) && yrange.cover?(character.y)
|
73
|
+
}
|
74
|
+
end
|
75
|
+
|
52
76
|
def grid
|
53
77
|
@grid ||=
|
54
78
|
{
|
data/lib/iguvium/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iguvium
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.2
|
4
|
+
version: 0.8.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dima Ermilov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-11-
|
11
|
+
date: 2018-11-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pdf-reader
|