gaspar-pdf 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -1
- data/gaspar-pdf.gemspec +1 -1
- data/lib/gaspar.rb +76 -38
- data/lib/gaspar/version.rb +1 -1
- data/spec/gaspar_spec.rb +20 -13
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5d61897dcb86359510caf89940b8b4273f8df92d
|
4
|
+
data.tar.gz: 44482e7a3dd4bb9c53995ad16e49ae9a01142bc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a50d871c11e83bb6a696bee1d75df7be959dd2979337d408d3e84c70aa303a04760c343ed255b9385044745fe760bb6889b8881f78f771e1afa5efc8c1c67a42
|
7
|
+
data.tar.gz: 880bee08cf94da0969b4ee738b051a22219b94f0c8facc7bdc34ed9444a54156eb0cf033eb623c2988655229ef2f98b66771eb6bc0d98139f3844f1bf0335c8f
|
data/README.md
CHANGED
@@ -29,13 +29,18 @@ require 'gaspar'
|
|
29
29
|
# This requires that the pdf-table-extract command is present in your PATH.
|
30
30
|
|
31
31
|
Gaspar.parse('document.pdf', 'document.html', {
|
32
|
-
|
32
|
+
format: 'table_html'
|
33
33
|
})
|
34
34
|
|
35
35
|
# Available options:
|
36
36
|
# page - page to parse
|
37
37
|
# format - the type of output: [cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list]
|
38
38
|
|
39
|
+
content = Gaspar.parse_with_content('document.pdf', 'document.html', {
|
40
|
+
format: 'table_html'
|
41
|
+
})
|
42
|
+
|
43
|
+
# you can get parsed content
|
39
44
|
```
|
40
45
|
|
41
46
|
Inspired by [Kristin](https://github.com/ricn/kristin)
|
data/gaspar-pdf.gemspec
CHANGED
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
|
|
14
14
|
spec.license = 'MIT'
|
15
15
|
|
16
16
|
spec.files = `git ls-files`.split($RS)
|
17
|
-
|
17
|
+
|
18
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
19
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
20
20
|
spec.require_paths = ['lib']
|
data/lib/gaspar.rb
CHANGED
@@ -2,6 +2,7 @@ require 'gaspar/version'
|
|
2
2
|
require 'open-uri'
|
3
3
|
require 'net/http'
|
4
4
|
require 'spoon'
|
5
|
+
require 'pdf-reader'
|
5
6
|
|
6
7
|
# Gaspar Gem
|
7
8
|
module Gaspar
|
@@ -12,34 +13,92 @@ module Gaspar
|
|
12
13
|
@source = source
|
13
14
|
@target = target
|
14
15
|
@options = options
|
16
|
+
@extractor = extractor
|
15
17
|
end
|
16
18
|
|
17
19
|
def parse
|
20
|
+
@extractor.extract
|
21
|
+
end
|
22
|
+
|
23
|
+
def parse_with_content
|
24
|
+
@extractor.extract
|
25
|
+
@extractor.content
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def extractor
|
31
|
+
src = determine_source(@source)
|
32
|
+
pdf = Reader.new(src)
|
33
|
+
|
34
|
+
Extractor.new(
|
35
|
+
src, @target, pdf.page_count, @options
|
36
|
+
)
|
37
|
+
end
|
38
|
+
|
39
|
+
def random_source_name
|
40
|
+
rand(16**16).to_s(16)
|
41
|
+
end
|
42
|
+
|
43
|
+
def download_file(source)
|
44
|
+
tmp_file = "/tmp/#{random_source_name}.pdf"
|
45
|
+
File.open(tmp_file, 'wb') do |saved_file|
|
46
|
+
open(URI.encode(source), 'rb') do |read_file|
|
47
|
+
saved_file.write(read_file.read)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
tmp_file
|
52
|
+
end
|
53
|
+
|
54
|
+
def determine_source(source)
|
55
|
+
is_file = File.exist?(source) && !File.directory?(source)
|
56
|
+
is_http = URI(source).scheme == 'http'
|
57
|
+
is_https = URI(source).scheme == 'https'
|
58
|
+
|
59
|
+
unless is_file || is_http || is_https
|
60
|
+
raise IOError, "Source (#{source}) is neither a file nor an URL."
|
61
|
+
end
|
62
|
+
|
63
|
+
is_file ? source : download_file(source)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
# Extract data from all pages of PDF
|
68
|
+
class Extractor
|
69
|
+
def initialize(source, target, pages, options)
|
70
|
+
@source = source
|
71
|
+
@target = target
|
72
|
+
@pages = pages
|
73
|
+
@options = options
|
74
|
+
end
|
75
|
+
|
76
|
+
def extract
|
18
77
|
unless command_available?
|
19
78
|
io_error 'Can\'t find pdf-table-extract executable in PATH'
|
20
79
|
end
|
21
80
|
|
22
|
-
|
23
|
-
opts = process_options(src).split(' ')
|
81
|
+
opts = process_options.split(' ')
|
24
82
|
args = [extract_command, opts].flatten
|
25
83
|
|
26
84
|
pid = Spoon.spawnp(*args)
|
27
85
|
Process.waitpid(pid)
|
86
|
+
io_error("Could not parse #{@source}") unless $?.exitstatus.zero?
|
87
|
+
end
|
28
88
|
|
29
|
-
|
89
|
+
def content
|
90
|
+
open(@target, 'rb').read
|
30
91
|
end
|
31
92
|
|
32
93
|
private
|
33
94
|
|
34
|
-
def
|
35
|
-
raise IOError, error_message
|
36
|
-
end
|
37
|
-
|
38
|
-
def process_options(source)
|
95
|
+
def process_options
|
39
96
|
opts = []
|
40
|
-
opts.push("-i #{source}") if source
|
97
|
+
opts.push("-i #{@source}") if @source
|
41
98
|
opts.push("-o #{@target}") if @target
|
42
|
-
|
99
|
+
@pages.times do |p|
|
100
|
+
opts.push("-p #{p + 1}")
|
101
|
+
end
|
43
102
|
opts.push("-t #{@options[:format]}") if @options[:format]
|
44
103
|
|
45
104
|
opts.join(' ')
|
@@ -64,38 +123,15 @@ module Gaspar
|
|
64
123
|
nil
|
65
124
|
end
|
66
125
|
|
67
|
-
def
|
68
|
-
|
69
|
-
end
|
70
|
-
|
71
|
-
def download_file(source)
|
72
|
-
tmp_file = "/tmp/#{random_source_name}.pdf"
|
73
|
-
File.open(tmp_file, 'wb') do |saved_file|
|
74
|
-
open(URI.encode(source), 'rb') do |read_file|
|
75
|
-
saved_file.write(read_file.read)
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
tmp_file
|
80
|
-
end
|
81
|
-
|
82
|
-
def determine_source(source)
|
83
|
-
is_file = File.exist?(source) && !File.directory?(source)
|
84
|
-
is_http = URI(source).scheme == 'http'
|
85
|
-
is_https = URI(source).scheme == 'https'
|
86
|
-
|
87
|
-
unless is_file || is_http || is_https
|
88
|
-
raise IOError, "Source (#{source}) is neither a file nor an URL."
|
89
|
-
end
|
90
|
-
|
91
|
-
is_file ? source : download_file(source)
|
126
|
+
def io_error(error_message)
|
127
|
+
raise IOError, error_message
|
92
128
|
end
|
93
129
|
end
|
94
130
|
|
95
131
|
# Read info from PDF file using pdf-reader
|
96
132
|
class Reader
|
97
133
|
def initialize(source)
|
98
|
-
@reader = PDF::Reader.new(source)
|
134
|
+
@reader = ::PDF::Reader.new(source)
|
99
135
|
end
|
100
136
|
|
101
137
|
def metadata
|
@@ -111,9 +147,11 @@ module Gaspar
|
|
111
147
|
end
|
112
148
|
end
|
113
149
|
|
114
|
-
# options[:type]
|
115
|
-
# {cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list}
|
116
150
|
def self.parse(source, target, options = {})
|
117
151
|
Parser.new(source, target, options).parse
|
118
152
|
end
|
153
|
+
|
154
|
+
def self.parse_with_content(source, target, options = {})
|
155
|
+
Parser.new(source, target, options).parse_with_content
|
156
|
+
end
|
119
157
|
end
|
data/lib/gaspar/version.rb
CHANGED
data/spec/gaspar_spec.rb
CHANGED
@@ -2,35 +2,42 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Gaspar do
|
4
4
|
before(:all) do
|
5
|
-
@table_pdf = file_path(
|
6
|
-
@target_path =
|
7
|
-
@table_html = @target_path +
|
5
|
+
@table_pdf = file_path('table.pdf')
|
6
|
+
@target_path = Dir.tmpdir.to_s
|
7
|
+
@table_html = @target_path + '/output.html'
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
10
|
describe '#parse' do
|
12
11
|
describe 'with wrong params' do
|
13
12
|
it 'should raise error if source file does not exists' do
|
14
|
-
|
15
|
-
|
13
|
+
expect do
|
14
|
+
c = Gaspar::Parser.new('unknown.pdf', 'unknown.html')
|
15
|
+
c.parse
|
16
|
+
end.to raise_error(IOError)
|
16
17
|
end
|
17
18
|
|
18
19
|
it 'should raise error if source is not file nor url' do
|
19
|
-
|
20
|
-
|
20
|
+
expect do
|
21
|
+
c = Gaspar::Parser.new('http:// /.pdf', 'unknown.html')
|
22
|
+
c.parse
|
23
|
+
end.to raise_error(URI::InvalidURIError)
|
21
24
|
end
|
22
25
|
end
|
23
26
|
|
24
27
|
describe 'with write params' do
|
25
|
-
it
|
26
|
-
Gaspar::Parser.new(@table_pdf, @table_html,
|
27
|
-
|
28
|
-
}).parse
|
28
|
+
it 'should be parsed' do
|
29
|
+
Gaspar::Parser.new(@table_pdf, @table_html,
|
30
|
+
format: 'table_html').parse
|
29
31
|
|
30
32
|
doc = Nokogiri::HTML(File.open(@table_html))
|
31
33
|
expect(doc.search('//comment()').text).not_to be_empty
|
32
34
|
end
|
33
|
-
end
|
34
35
|
|
36
|
+
it 'should get content' do
|
37
|
+
content = Gaspar::Parser.new(@table_pdf, @table_html,
|
38
|
+
format: 'table_html').parse_with_content
|
39
|
+
expect(content).not_to be_empty
|
40
|
+
end
|
41
|
+
end
|
35
42
|
end
|
36
43
|
end
|