iguvium 0.8.2 → 0.8.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b453936d07d4aea3f6eb682ba90f6c6642f83520b14462b8c46a254c6dc538f7
4
- data.tar.gz: 27fc73360191edbb1185a073d6cf4b644455af682764ff8f2cdbaeb9a5b1b973
3
+ metadata.gz: a70226f57667319ee056c909d177e04e816721aee9706f8ba6351e592cc4c209
4
+ data.tar.gz: dc3a3ccff9aee45b27224656a9a7e6b2a4d758ee0220697b145bf6ae3918e747
5
5
  SHA512:
6
- metadata.gz: de9c944ba0504a9c8bc4badcfe2bee974cc5244111999f0109939009fdbb9c167962a51a17e181763138b04909d01c1f2ccfff8a3317ebab1cd9bf1b6e26e9e6
7
- data.tar.gz: 7edeb83865b934edd01d509d0024241f6dacf727394d2b72077ff9c0e02189fdae7f1b81fcca961602ab8d2bcf42fa3fb9657bf627ce1ccdd2b2649d01ee97bf
6
+ metadata.gz: 59aa03ad81d1c9c085fb0dd3b66a48e88d44a4a8b516e83f0160a47de8bde32ae58769efa2f700276779695e9286c4eaaa6a76db432adeb1df4ac5f0be8d90fb
7
+ data.tar.gz: 58e8f45ca66aa7841207c50597d400a618e38aafa091608fd4a4db7cb7406c0473d884320e08d4cc67b26a92ba5d7c64e6aa8ef5ab1bcf4f00d9ba730eb2529a
data/README.md CHANGED
@@ -89,6 +89,7 @@ iguvium filename.pdf [options]
89
89
  -p, --pages page numbers, comma-separated, no spaces
90
90
  -i, --images use pictures in pdf (usually a bad idea)
91
91
  -n, --newlines keep newlines
92
+ --phrases keep phrases unsplit, could fix some merged cells
92
93
  -t, --text extract full page text instead of tables
93
94
  --verbose verbose output
94
95
  ```
data/exe/iguvium CHANGED
@@ -9,6 +9,7 @@ opts = Slop.parse { |o|
9
9
  o.array '-p', '--pages', 'page numbers, comma-separated, no spaces'
10
10
  o.bool '-i', '--images', 'use pictures in pdf (usually a bad idea)'
11
11
  o.bool '-n', '--newlines', 'keep newlines'
12
+ o.bool '--phrases', 'keep phrases unsplit, could fix some merged cells'
12
13
  o.bool '-t', '--text', 'extract full page text instead of tables'
13
14
  o.bool '--verbose', 'verbose output'
14
15
  o.on '--version', 'print the version' do
@@ -54,7 +55,7 @@ page_numbers.each do |number|
54
55
  tables.empty? ? puts('no tables found') : puts
55
56
 
56
57
  tables.each_with_index do |table, i|
57
- csv = table.to_a(newlines: opts[:newlines]).map(&:to_csv).join
58
+ csv = table.to_a(newlines: opts[:newlines], phrases: opts[:phrases]).map(&:to_csv).join
58
59
  next if csv.empty?
59
60
  csv_file = "#{path.gsub(/\.pdf$/, '')}_page_#{number + 1}_table_#{i}.csv"
60
61
  puts "Saving #{File.expand_path csv_file}"
data/lib/iguvium/table.rb CHANGED
@@ -24,12 +24,22 @@ module Iguvium
24
24
  # Sometimes you may need to keep them; in this case use `newlines: true` option.
25
25
  #
26
26
  # @param [Boolean] newlines keep newlines inside table cells, false by default
27
+ # @param [Boolean] phrases keep phrases unsplit, false by default.
28
+ # Poor man's merged cells workaround. Could break some tables, could fix some.
29
+ #
27
30
  # @return [Array] 2D array of strings (content of table's cells)
28
31
  #
29
- def to_a(newlines: false)
32
+ def to_a(newlines: false, phrases: false)
30
33
  grid[:rows]
31
34
  .reverse
32
- .map { |row| grid[:columns].map { |column| render(chars_inside(column, row), newlines: newlines) } }
35
+ .map { |row|
36
+ grid[:columns].map do |column|
37
+ render(
38
+ phrases ? words_inside(column, row) : chars_inside(column, row),
39
+ newlines: newlines
40
+ )
41
+ end
42
+ }
33
43
  end
34
44
 
35
45
  private
@@ -49,6 +59,20 @@ module Iguvium
49
59
  .select { |character| xrange.cover?(character.x) && yrange.cover?(character.y) }
50
60
  end
51
61
 
62
+ def words
63
+ @words ||=
64
+ characters
65
+ .sort
66
+ .chunk_while { |a, b| a.mergable?(b) }
67
+ .map { |chunk| chunk.inject(:+) }
68
+ end
69
+
70
+ def words_inside(xrange, yrange)
71
+ words.select { |character|
72
+ xrange.cover?(character.x) && yrange.cover?(character.y)
73
+ }
74
+ end
75
+
52
76
  def grid
53
77
  @grid ||=
54
78
  {
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iguvium
4
- VERSION = '0.8.2'
4
+ VERSION = '0.8.3'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iguvium
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: 0.8.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dima Ermilov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-11-21 00:00:00.000000000 Z
11
+ date: 2018-11-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pdf-reader