iguvium 0.8.2 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b453936d07d4aea3f6eb682ba90f6c6642f83520b14462b8c46a254c6dc538f7
4
- data.tar.gz: 27fc73360191edbb1185a073d6cf4b644455af682764ff8f2cdbaeb9a5b1b973
3
+ metadata.gz: a70226f57667319ee056c909d177e04e816721aee9706f8ba6351e592cc4c209
4
+ data.tar.gz: dc3a3ccff9aee45b27224656a9a7e6b2a4d758ee0220697b145bf6ae3918e747
5
5
  SHA512:
6
- metadata.gz: de9c944ba0504a9c8bc4badcfe2bee974cc5244111999f0109939009fdbb9c167962a51a17e181763138b04909d01c1f2ccfff8a3317ebab1cd9bf1b6e26e9e6
7
- data.tar.gz: 7edeb83865b934edd01d509d0024241f6dacf727394d2b72077ff9c0e02189fdae7f1b81fcca961602ab8d2bcf42fa3fb9657bf627ce1ccdd2b2649d01ee97bf
6
+ metadata.gz: 59aa03ad81d1c9c085fb0dd3b66a48e88d44a4a8b516e83f0160a47de8bde32ae58769efa2f700276779695e9286c4eaaa6a76db432adeb1df4ac5f0be8d90fb
7
+ data.tar.gz: 58e8f45ca66aa7841207c50597d400a618e38aafa091608fd4a4db7cb7406c0473d884320e08d4cc67b26a92ba5d7c64e6aa8ef5ab1bcf4f00d9ba730eb2529a
data/README.md CHANGED
@@ -89,6 +89,7 @@ iguvium filename.pdf [options]
89
89
  -p, --pages page numbers, comma-separated, no spaces
90
90
  -i, --images use pictures in pdf (usually a bad idea)
91
91
  -n, --newlines keep newlines
92
+ --phrases keep phrases unsplit, could fix some merged cells
92
93
  -t, --text extract full page text instead of tables
93
94
  --verbose verbose output
94
95
  ```
data/exe/iguvium CHANGED
@@ -9,6 +9,7 @@ opts = Slop.parse { |o|
9
9
  o.array '-p', '--pages', 'page numbers, comma-separated, no spaces'
10
10
  o.bool '-i', '--images', 'use pictures in pdf (usually a bad idea)'
11
11
  o.bool '-n', '--newlines', 'keep newlines'
12
+ o.bool '--phrases', 'keep phrases unsplit, could fix some merged cells'
12
13
  o.bool '-t', '--text', 'extract full page text instead of tables'
13
14
  o.bool '--verbose', 'verbose output'
14
15
  o.on '--version', 'print the version' do
@@ -54,7 +55,7 @@ page_numbers.each do |number|
54
55
  tables.empty? ? puts('no tables found') : puts
55
56
 
56
57
  tables.each_with_index do |table, i|
57
- csv = table.to_a(newlines: opts[:newlines]).map(&:to_csv).join
58
+ csv = table.to_a(newlines: opts[:newlines], phrases: opts[:phrases]).map(&:to_csv).join
58
59
  next if csv.empty?
59
60
  csv_file = "#{path.gsub(/\.pdf$/, '')}_page_#{number + 1}_table_#{i}.csv"
60
61
  puts "Saving #{File.expand_path csv_file}"
data/lib/iguvium/table.rb CHANGED
@@ -24,12 +24,22 @@ module Iguvium
24
24
  # Sometimes you may need to keep them; in this case use `newlines: true` option.
25
25
  #
26
26
  # @param [Boolean] newlines keep newlines inside table cells, false by default
27
+ # @param [Boolean] phrases keep phrases unsplit, false by default.
28
+ # Poor man's merged cells workaround. Could break some tables, could fix some.
29
+ #
27
30
  # @return [Array] 2D array of strings (content of table's cells)
28
31
  #
29
- def to_a(newlines: false)
32
+ def to_a(newlines: false, phrases: false)
30
33
  grid[:rows]
31
34
  .reverse
32
- .map { |row| grid[:columns].map { |column| render(chars_inside(column, row), newlines: newlines) } }
35
+ .map { |row|
36
+ grid[:columns].map do |column|
37
+ render(
38
+ phrases ? words_inside(column, row) : chars_inside(column, row),
39
+ newlines: newlines
40
+ )
41
+ end
42
+ }
33
43
  end
34
44
 
35
45
  private
@@ -49,6 +59,20 @@ module Iguvium
49
59
  .select { |character| xrange.cover?(character.x) && yrange.cover?(character.y) }
50
60
  end
51
61
 
62
+ def words
63
+ @words ||=
64
+ characters
65
+ .sort
66
+ .chunk_while { |a, b| a.mergable?(b) }
67
+ .map { |chunk| chunk.inject(:+) }
68
+ end
69
+
70
+ def words_inside(xrange, yrange)
71
+ words.select { |character|
72
+ xrange.cover?(character.x) && yrange.cover?(character.y)
73
+ }
74
+ end
75
+
52
76
  def grid
53
77
  @grid ||=
54
78
  {
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iguvium
4
- VERSION = '0.8.2'
4
+ VERSION = '0.8.3'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iguvium
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: 0.8.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dima Ermilov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-11-21 00:00:00.000000000 Z
11
+ date: 2018-11-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pdf-reader