iguvium 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +7 -1
- data/exe/iguvium +13 -1
- data/lib/iguvium/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f3c9f4727d6069408bb795a4754c1b3f7098a997b7b6b47a9e114c4c04602f17
|
4
|
+
data.tar.gz: f7490d4f57e2e845740713cddff88fd558070f7ab4481598bf08fc504c33802c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b973456ea806aef36d4b8328ae2740f624e31bf169d4861d0c6e095133a2fe33e41780e11830c4f891b2d72204873e20225575130e59f5ab55fe6e496a9c0c45
|
7
|
+
data.tar.gz: 218edd15c127f9d705759d1f32dbedbcd68615d1c4d77d4ca8e63b0d0d12801c759444cc4d4967c81e3cc4d88b4a90fb76792794d69b29a25395561bc78eaf0e
|
data/README.md
CHANGED
@@ -60,6 +60,10 @@ And then execute:
|
|
60
60
|
Or install it yourself as:
|
61
61
|
|
62
62
|
$ gem install iguvium
|
63
|
+
|
64
|
+
If you're not a developer and have a Mac, you may have the default Ruby installation and no development tools installed.
|
65
|
+
|
66
|
+
In this case, run `xcode-select --install` beforehand, and after that install Iguvium as admin: `sudo gem install iguvium`
|
63
67
|
|
64
68
|
## Usage
|
65
69
|
|
@@ -85,10 +89,12 @@ iguvium filename.pdf [options]
|
|
85
89
|
-p, --pages page numbers, comma-separated, no spaces
|
86
90
|
-i, --images use pictures in pdf (usually a bad idea)
|
87
91
|
-n, --newlines keep newlines
|
92
|
+
-t, --text extract full page text instead of tables
|
88
93
|
--verbose verbose output
|
89
94
|
```
|
90
95
|
|
91
|
-
Given a filename, it generates CSV files for the tables detected
|
96
|
+
Given a filename, it generates CSV files for the tables detected or, with `-t` option,
|
97
|
+
just page text. The latter is useful in case of whitespace-separated fixed-width tables.
|
92
98
|
|
93
99
|
## Implementation details
|
94
100
|
There are usually no actual tables in PDFs, only characters with coordinates,
|
data/exe/iguvium
CHANGED
@@ -9,6 +9,7 @@ opts = Slop.parse { |o|
|
|
9
9
|
o.array '-p', '--pages', 'page numbers, comma-separated, no spaces'
|
10
10
|
o.bool '-i', '--images', 'use pictures in pdf (usually a bad idea)'
|
11
11
|
o.bool '-n', '--newlines', 'keep newlines'
|
12
|
+
o.bool '-t', '--text', 'extract full page text instead of tables'
|
12
13
|
o.bool '--verbose', 'verbose output'
|
13
14
|
o.on '--version', 'print the version' do
|
14
15
|
puts Iguvium::VERSION
|
@@ -36,6 +37,17 @@ page_numbers = pages.count.times.to_a if page_numbers.empty?
|
|
36
37
|
# puts page_numbers.inspect
|
37
38
|
# puts opts.to_hash.inspect
|
38
39
|
|
40
|
+
if opts[:text]
|
41
|
+
page_numbers.each do |number|
|
42
|
+
print "Extracting page #{number + 1}... "
|
43
|
+
txt = pages[number].text
|
44
|
+
txt_file = "#{path.gsub(/\.pdf$/, '')}_page_#{number + 1}.txt"
|
45
|
+
puts "Saving #{File.expand_path txt_file}"
|
46
|
+
File.write txt_file, txt
|
47
|
+
end
|
48
|
+
exit
|
49
|
+
end
|
50
|
+
|
39
51
|
page_numbers.each do |number|
|
40
52
|
print "Extracting page #{number + 1}... "
|
41
53
|
tables = pages[number].extract_tables!(images: opts[:images])
|
@@ -45,7 +57,7 @@ page_numbers.each do |number|
|
|
45
57
|
csv = table.to_a(newlines: opts[:newlines]).map(&:to_csv).join
|
46
58
|
next if csv.empty?
|
47
59
|
csv_file = "#{path.gsub(/\.pdf$/, '')}_page_#{number + 1}_table_#{i}.csv"
|
48
|
-
puts "Saving #{File.expand_path
|
60
|
+
puts "Saving #{File.expand_path csv_file}"
|
49
61
|
File.write csv_file, csv
|
50
62
|
end
|
51
63
|
end
|
data/lib/iguvium/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iguvium
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 0.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dima Ermilov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-11-
|
11
|
+
date: 2018-11-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pdf-reader
|