gaspar-pdf 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -1
- data/gaspar-pdf.gemspec +1 -1
- data/lib/gaspar.rb +76 -38
- data/lib/gaspar/version.rb +1 -1
- data/spec/gaspar_spec.rb +20 -13
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5d61897dcb86359510caf89940b8b4273f8df92d
|
4
|
+
data.tar.gz: 44482e7a3dd4bb9c53995ad16e49ae9a01142bc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a50d871c11e83bb6a696bee1d75df7be959dd2979337d408d3e84c70aa303a04760c343ed255b9385044745fe760bb6889b8881f78f771e1afa5efc8c1c67a42
|
7
|
+
data.tar.gz: 880bee08cf94da0969b4ee738b051a22219b94f0c8facc7bdc34ed9444a54156eb0cf033eb623c2988655229ef2f98b66771eb6bc0d98139f3844f1bf0335c8f
|
data/README.md
CHANGED
@@ -29,13 +29,18 @@ require 'gaspar'
|
|
29
29
|
# This requires that the pdf-table-extract command is present in your PATH.
|
30
30
|
|
31
31
|
Gaspar.parse('document.pdf', 'document.html', {
|
32
|
-
|
32
|
+
format: 'table_html'
|
33
33
|
})
|
34
34
|
|
35
35
|
# Available options:
|
36
36
|
# page - page to parse
|
37
37
|
# format - the type of output: [cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list]
|
38
38
|
|
39
|
+
content = Gaspar.parse_with_content('document.pdf', 'document.html', {
|
40
|
+
format: 'table_html'
|
41
|
+
})
|
42
|
+
|
43
|
+
# you can get parsed content
|
39
44
|
```
|
40
45
|
|
41
46
|
Inspired by [Kristin](https://github.com/ricn/kristin)
|
data/gaspar-pdf.gemspec
CHANGED
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
|
|
14
14
|
spec.license = 'MIT'
|
15
15
|
|
16
16
|
spec.files = `git ls-files`.split($RS)
|
17
|
-
|
17
|
+
|
18
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
19
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
20
20
|
spec.require_paths = ['lib']
|
data/lib/gaspar.rb
CHANGED
@@ -2,6 +2,7 @@ require 'gaspar/version'
|
|
2
2
|
require 'open-uri'
|
3
3
|
require 'net/http'
|
4
4
|
require 'spoon'
|
5
|
+
require 'pdf-reader'
|
5
6
|
|
6
7
|
# Gaspar Gem
|
7
8
|
module Gaspar
|
@@ -12,34 +13,92 @@ module Gaspar
|
|
12
13
|
@source = source
|
13
14
|
@target = target
|
14
15
|
@options = options
|
16
|
+
@extractor = extractor
|
15
17
|
end
|
16
18
|
|
17
19
|
def parse
|
20
|
+
@extractor.extract
|
21
|
+
end
|
22
|
+
|
23
|
+
def parse_with_content
|
24
|
+
@extractor.extract
|
25
|
+
@extractor.content
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def extractor
|
31
|
+
src = determine_source(@source)
|
32
|
+
pdf = Reader.new(src)
|
33
|
+
|
34
|
+
Extractor.new(
|
35
|
+
src, @target, pdf.page_count, @options
|
36
|
+
)
|
37
|
+
end
|
38
|
+
|
39
|
+
def random_source_name
|
40
|
+
rand(16**16).to_s(16)
|
41
|
+
end
|
42
|
+
|
43
|
+
def download_file(source)
|
44
|
+
tmp_file = "/tmp/#{random_source_name}.pdf"
|
45
|
+
File.open(tmp_file, 'wb') do |saved_file|
|
46
|
+
open(URI.encode(source), 'rb') do |read_file|
|
47
|
+
saved_file.write(read_file.read)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
tmp_file
|
52
|
+
end
|
53
|
+
|
54
|
+
def determine_source(source)
|
55
|
+
is_file = File.exist?(source) && !File.directory?(source)
|
56
|
+
is_http = URI(source).scheme == 'http'
|
57
|
+
is_https = URI(source).scheme == 'https'
|
58
|
+
|
59
|
+
unless is_file || is_http || is_https
|
60
|
+
raise IOError, "Source (#{source}) is neither a file nor an URL."
|
61
|
+
end
|
62
|
+
|
63
|
+
is_file ? source : download_file(source)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
# Extract data from all pages of PDF
|
68
|
+
class Extractor
|
69
|
+
def initialize(source, target, pages, options)
|
70
|
+
@source = source
|
71
|
+
@target = target
|
72
|
+
@pages = pages
|
73
|
+
@options = options
|
74
|
+
end
|
75
|
+
|
76
|
+
def extract
|
18
77
|
unless command_available?
|
19
78
|
io_error 'Can\'t find pdf-table-extract executable in PATH'
|
20
79
|
end
|
21
80
|
|
22
|
-
|
23
|
-
opts = process_options(src).split(' ')
|
81
|
+
opts = process_options.split(' ')
|
24
82
|
args = [extract_command, opts].flatten
|
25
83
|
|
26
84
|
pid = Spoon.spawnp(*args)
|
27
85
|
Process.waitpid(pid)
|
86
|
+
io_error("Could not parse #{@source}") unless $?.exitstatus.zero?
|
87
|
+
end
|
28
88
|
|
29
|
-
|
89
|
+
def content
|
90
|
+
open(@target, 'rb').read
|
30
91
|
end
|
31
92
|
|
32
93
|
private
|
33
94
|
|
34
|
-
def
|
35
|
-
raise IOError, error_message
|
36
|
-
end
|
37
|
-
|
38
|
-
def process_options(source)
|
95
|
+
def process_options
|
39
96
|
opts = []
|
40
|
-
opts.push("-i #{source}") if source
|
97
|
+
opts.push("-i #{@source}") if @source
|
41
98
|
opts.push("-o #{@target}") if @target
|
42
|
-
|
99
|
+
@pages.times do |p|
|
100
|
+
opts.push("-p #{p + 1}")
|
101
|
+
end
|
43
102
|
opts.push("-t #{@options[:format]}") if @options[:format]
|
44
103
|
|
45
104
|
opts.join(' ')
|
@@ -64,38 +123,15 @@ module Gaspar
|
|
64
123
|
nil
|
65
124
|
end
|
66
125
|
|
67
|
-
def
|
68
|
-
|
69
|
-
end
|
70
|
-
|
71
|
-
def download_file(source)
|
72
|
-
tmp_file = "/tmp/#{random_source_name}.pdf"
|
73
|
-
File.open(tmp_file, 'wb') do |saved_file|
|
74
|
-
open(URI.encode(source), 'rb') do |read_file|
|
75
|
-
saved_file.write(read_file.read)
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
tmp_file
|
80
|
-
end
|
81
|
-
|
82
|
-
def determine_source(source)
|
83
|
-
is_file = File.exist?(source) && !File.directory?(source)
|
84
|
-
is_http = URI(source).scheme == 'http'
|
85
|
-
is_https = URI(source).scheme == 'https'
|
86
|
-
|
87
|
-
unless is_file || is_http || is_https
|
88
|
-
raise IOError, "Source (#{source}) is neither a file nor an URL."
|
89
|
-
end
|
90
|
-
|
91
|
-
is_file ? source : download_file(source)
|
126
|
+
def io_error(error_message)
|
127
|
+
raise IOError, error_message
|
92
128
|
end
|
93
129
|
end
|
94
130
|
|
95
131
|
# Read info from PDF file using pdf-reader
|
96
132
|
class Reader
|
97
133
|
def initialize(source)
|
98
|
-
@reader = PDF::Reader.new(source)
|
134
|
+
@reader = ::PDF::Reader.new(source)
|
99
135
|
end
|
100
136
|
|
101
137
|
def metadata
|
@@ -111,9 +147,11 @@ module Gaspar
|
|
111
147
|
end
|
112
148
|
end
|
113
149
|
|
114
|
-
# options[:type]
|
115
|
-
# {cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list}
|
116
150
|
def self.parse(source, target, options = {})
|
117
151
|
Parser.new(source, target, options).parse
|
118
152
|
end
|
153
|
+
|
154
|
+
def self.parse_with_content(source, target, options = {})
|
155
|
+
Parser.new(source, target, options).parse_with_content
|
156
|
+
end
|
119
157
|
end
|
data/lib/gaspar/version.rb
CHANGED
data/spec/gaspar_spec.rb
CHANGED
@@ -2,35 +2,42 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Gaspar do
|
4
4
|
before(:all) do
|
5
|
-
@table_pdf = file_path(
|
6
|
-
@target_path =
|
7
|
-
@table_html = @target_path +
|
5
|
+
@table_pdf = file_path('table.pdf')
|
6
|
+
@target_path = Dir.tmpdir.to_s
|
7
|
+
@table_html = @target_path + '/output.html'
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
10
|
describe '#parse' do
|
12
11
|
describe 'with wrong params' do
|
13
12
|
it 'should raise error if source file does not exists' do
|
14
|
-
|
15
|
-
|
13
|
+
expect do
|
14
|
+
c = Gaspar::Parser.new('unknown.pdf', 'unknown.html')
|
15
|
+
c.parse
|
16
|
+
end.to raise_error(IOError)
|
16
17
|
end
|
17
18
|
|
18
19
|
it 'should raise error if source is not file nor url' do
|
19
|
-
|
20
|
-
|
20
|
+
expect do
|
21
|
+
c = Gaspar::Parser.new('http:// /.pdf', 'unknown.html')
|
22
|
+
c.parse
|
23
|
+
end.to raise_error(URI::InvalidURIError)
|
21
24
|
end
|
22
25
|
end
|
23
26
|
|
24
27
|
describe 'with write params' do
|
25
|
-
it
|
26
|
-
Gaspar::Parser.new(@table_pdf, @table_html,
|
27
|
-
|
28
|
-
}).parse
|
28
|
+
it 'should be parsed' do
|
29
|
+
Gaspar::Parser.new(@table_pdf, @table_html,
|
30
|
+
format: 'table_html').parse
|
29
31
|
|
30
32
|
doc = Nokogiri::HTML(File.open(@table_html))
|
31
33
|
expect(doc.search('//comment()').text).not_to be_empty
|
32
34
|
end
|
33
|
-
end
|
34
35
|
|
36
|
+
it 'should get content' do
|
37
|
+
content = Gaspar::Parser.new(@table_pdf, @table_html,
|
38
|
+
format: 'table_html').parse_with_content
|
39
|
+
expect(content).not_to be_empty
|
40
|
+
end
|
41
|
+
end
|
35
42
|
end
|
36
43
|
end
|