iguvium 0.8.2 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/exe/iguvium +2 -1
- data/lib/iguvium/table.rb +26 -2
- data/lib/iguvium/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a70226f57667319ee056c909d177e04e816721aee9706f8ba6351e592cc4c209
|
4
|
+
data.tar.gz: dc3a3ccff9aee45b27224656a9a7e6b2a4d758ee0220697b145bf6ae3918e747
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 59aa03ad81d1c9c085fb0dd3b66a48e88d44a4a8b516e83f0160a47de8bde32ae58769efa2f700276779695e9286c4eaaa6a76db432adeb1df4ac5f0be8d90fb
|
7
|
+
data.tar.gz: 58e8f45ca66aa7841207c50597d400a618e38aafa091608fd4a4db7cb7406c0473d884320e08d4cc67b26a92ba5d7c64e6aa8ef5ab1bcf4f00d9ba730eb2529a
|
data/README.md
CHANGED
@@ -89,6 +89,7 @@ iguvium filename.pdf [options]
|
|
89
89
|
-p, --pages page numbers, comma-separated, no spaces
|
90
90
|
-i, --images use pictures in pdf (usually a bad idea)
|
91
91
|
-n, --newlines keep newlines
|
92
|
+
--phrases keep phrases unsplit, could fix some merged cells
|
92
93
|
-t, --text extract full page text instead of tables
|
93
94
|
--verbose verbose output
|
94
95
|
```
|
data/exe/iguvium
CHANGED
@@ -9,6 +9,7 @@ opts = Slop.parse { |o|
|
|
9
9
|
o.array '-p', '--pages', 'page numbers, comma-separated, no spaces'
|
10
10
|
o.bool '-i', '--images', 'use pictures in pdf (usually a bad idea)'
|
11
11
|
o.bool '-n', '--newlines', 'keep newlines'
|
12
|
+
o.bool '--phrases', 'keep phrases unsplit, could fix some merged cells'
|
12
13
|
o.bool '-t', '--text', 'extract full page text instead of tables'
|
13
14
|
o.bool '--verbose', 'verbose output'
|
14
15
|
o.on '--version', 'print the version' do
|
@@ -54,7 +55,7 @@ page_numbers.each do |number|
|
|
54
55
|
tables.empty? ? puts('no tables found') : puts
|
55
56
|
|
56
57
|
tables.each_with_index do |table, i|
|
57
|
-
csv = table.to_a(newlines: opts[:newlines]).map(&:to_csv).join
|
58
|
+
csv = table.to_a(newlines: opts[:newlines], phrases: opts[:phrases]).map(&:to_csv).join
|
58
59
|
next if csv.empty?
|
59
60
|
csv_file = "#{path.gsub(/\.pdf$/, '')}_page_#{number + 1}_table_#{i}.csv"
|
60
61
|
puts "Saving #{File.expand_path csv_file}"
|
data/lib/iguvium/table.rb
CHANGED
@@ -24,12 +24,22 @@ module Iguvium
|
|
24
24
|
# Sometimes you may need to keep them; in this case use `newlines: true` option.
|
25
25
|
#
|
26
26
|
# @param [Boolean] newlines keep newlines inside table cells, false by default
|
27
|
+
# @param [Boolean] phrases keep phrases unsplit, false by default.
|
28
|
+
# Poor man's merged cells workaround. Could break some tables, could fix some.
|
29
|
+
#
|
27
30
|
# @return [Array] 2D array of strings (content of table's cells)
|
28
31
|
#
|
29
|
-
def to_a(newlines: false)
|
32
|
+
def to_a(newlines: false, phrases: false)
|
30
33
|
grid[:rows]
|
31
34
|
.reverse
|
32
|
-
.map { |row|
|
35
|
+
.map { |row|
|
36
|
+
grid[:columns].map do |column|
|
37
|
+
render(
|
38
|
+
phrases ? words_inside(column, row) : chars_inside(column, row),
|
39
|
+
newlines: newlines
|
40
|
+
)
|
41
|
+
end
|
42
|
+
}
|
33
43
|
end
|
34
44
|
|
35
45
|
private
|
@@ -49,6 +59,20 @@ module Iguvium
|
|
49
59
|
.select { |character| xrange.cover?(character.x) && yrange.cover?(character.y) }
|
50
60
|
end
|
51
61
|
|
62
|
+
def words
|
63
|
+
@words ||=
|
64
|
+
characters
|
65
|
+
.sort
|
66
|
+
.chunk_while { |a, b| a.mergable?(b) }
|
67
|
+
.map { |chunk| chunk.inject(:+) }
|
68
|
+
end
|
69
|
+
|
70
|
+
def words_inside(xrange, yrange)
|
71
|
+
words.select { |character|
|
72
|
+
xrange.cover?(character.x) && yrange.cover?(character.y)
|
73
|
+
}
|
74
|
+
end
|
75
|
+
|
52
76
|
def grid
|
53
77
|
@grid ||=
|
54
78
|
{
|
data/lib/iguvium/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iguvium
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.2
|
4
|
+
version: 0.8.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dima Ermilov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-11-
|
11
|
+
date: 2018-11-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pdf-reader
|