gaspar-pdf 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 61dc07bbed984fb395fe3e6de603f862906c853a
4
- data.tar.gz: b1130b45efc157db8a9c0ccc6a6f768d71b6e159
3
+ metadata.gz: 5d61897dcb86359510caf89940b8b4273f8df92d
4
+ data.tar.gz: 44482e7a3dd4bb9c53995ad16e49ae9a01142bc0
5
5
  SHA512:
6
- metadata.gz: 1aa7157bf22ca40cb902fa894a17ac329e52d002b6387fe9fa0850537ab205b4063ad3e6e77a919e1d4e10461c8dc02eabdee733811a4e4481930be54a6fd989
7
- data.tar.gz: b4aeb3d4ade931275b57d2924ae1639257634d98460bdfffe85c34a3e7b921d9e4dd7d18fb0f61cd8d17f0540de13875bb097ae582566ce2c49a53459ffee998
6
+ metadata.gz: a50d871c11e83bb6a696bee1d75df7be959dd2979337d408d3e84c70aa303a04760c343ed255b9385044745fe760bb6889b8881f78f771e1afa5efc8c1c67a42
7
+ data.tar.gz: 880bee08cf94da0969b4ee738b051a22219b94f0c8facc7bdc34ed9444a54156eb0cf033eb623c2988655229ef2f98b66771eb6bc0d98139f3844f1bf0335c8f
data/README.md CHANGED
@@ -29,13 +29,18 @@ require 'gaspar'
29
29
  # This requires that the pdf-table-extract command is present in your PATH.
30
30
 
31
31
  Gaspar.parse('document.pdf', 'document.html', {
32
- page: 1, format: 'table_html'
32
+ format: 'table_html'
33
33
  })
34
34
 
35
35
  # Available options:
36
36
  # page - page to parse
37
37
  # format - the type of output: [cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list]
38
38
 
39
+ content = Gaspar.parse_with_content('document.pdf', 'document.html', {
40
+ format: 'table_html'
41
+ })
42
+
43
+ # `content` now holds the parsed output (the contents of document.html)
39
44
  ```
40
45
 
41
46
  Inspired by [Kristin](https://github.com/ricn/kristin)
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
14
14
  spec.license = 'MIT'
15
15
 
16
16
  spec.files = `git ls-files`.split($RS)
17
-
17
+
18
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
19
19
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
20
20
  spec.require_paths = ['lib']
@@ -2,6 +2,7 @@ require 'gaspar/version'
2
2
  require 'open-uri'
3
3
  require 'net/http'
4
4
  require 'spoon'
5
+ require 'pdf-reader'
5
6
 
6
7
  # Gaspar Gem
7
8
  module Gaspar
@@ -12,34 +13,92 @@ module Gaspar
12
13
  @source = source
13
14
  @target = target
14
15
  @options = options
16
+ @extractor = extractor
15
17
  end
16
18
 
17
19
  def parse
20
+ @extractor.extract
21
+ end
22
+
23
+ def parse_with_content
24
+ @extractor.extract
25
+ @extractor.content
26
+ end
27
+
28
+ private
29
+
30
+ def extractor
31
+ src = determine_source(@source)
32
+ pdf = Reader.new(src)
33
+
34
+ Extractor.new(
35
+ src, @target, pdf.page_count, @options
36
+ )
37
+ end
38
+
39
+ def random_source_name
40
+ rand(16**16).to_s(16)
41
+ end
42
+
43
+ def download_file(source)
44
+ tmp_file = "/tmp/#{random_source_name}.pdf"
45
+ File.open(tmp_file, 'wb') do |saved_file|
46
+ open(URI.encode(source), 'rb') do |read_file|
47
+ saved_file.write(read_file.read)
48
+ end
49
+ end
50
+
51
+ tmp_file
52
+ end
53
+
54
+ def determine_source(source)
55
+ is_file = File.exist?(source) && !File.directory?(source)
56
+ is_http = URI(source).scheme == 'http'
57
+ is_https = URI(source).scheme == 'https'
58
+
59
+ unless is_file || is_http || is_https
60
+ raise IOError, "Source (#{source}) is neither a file nor an URL."
61
+ end
62
+
63
+ is_file ? source : download_file(source)
64
+ end
65
+ end
66
+
67
+ # Extract data from all pages of PDF
68
+ class Extractor
69
+ def initialize(source, target, pages, options)
70
+ @source = source
71
+ @target = target
72
+ @pages = pages
73
+ @options = options
74
+ end
75
+
76
+ def extract
18
77
  unless command_available?
19
78
  io_error 'Can\'t find pdf-table-extract executable in PATH'
20
79
  end
21
80
 
22
- src = determine_source(@source)
23
- opts = process_options(src).split(' ')
81
+ opts = process_options.split(' ')
24
82
  args = [extract_command, opts].flatten
25
83
 
26
84
  pid = Spoon.spawnp(*args)
27
85
  Process.waitpid(pid)
86
+ io_error("Could not parse #{@source}") unless $?.exitstatus.zero?
87
+ end
28
88
 
29
- io_error("Could not parse #{src}") unless $?.exitstatus.zero?
89
+ def content
90
+ open(@target, 'rb').read
30
91
  end
31
92
 
32
93
  private
33
94
 
34
- def io_error(error_message)
35
- raise IOError, error_message
36
- end
37
-
38
- def process_options(source)
95
+ def process_options
39
96
  opts = []
40
- opts.push("-i #{source}") if source
97
+ opts.push("-i #{@source}") if @source
41
98
  opts.push("-o #{@target}") if @target
42
- opts.push("-p #{@options[:page]}") if @options[:page]
99
+ @pages.times do |p|
100
+ opts.push("-p #{p + 1}")
101
+ end
43
102
  opts.push("-t #{@options[:format]}") if @options[:format]
44
103
 
45
104
  opts.join(' ')
@@ -64,38 +123,15 @@ module Gaspar
64
123
  nil
65
124
  end
66
125
 
67
- def random_source_name
68
- rand(16**16).to_s(16)
69
- end
70
-
71
- def download_file(source)
72
- tmp_file = "/tmp/#{random_source_name}.pdf"
73
- File.open(tmp_file, 'wb') do |saved_file|
74
- open(URI.encode(source), 'rb') do |read_file|
75
- saved_file.write(read_file.read)
76
- end
77
- end
78
-
79
- tmp_file
80
- end
81
-
82
- def determine_source(source)
83
- is_file = File.exist?(source) && !File.directory?(source)
84
- is_http = URI(source).scheme == 'http'
85
- is_https = URI(source).scheme == 'https'
86
-
87
- unless is_file || is_http || is_https
88
- raise IOError, "Source (#{source}) is neither a file nor an URL."
89
- end
90
-
91
- is_file ? source : download_file(source)
126
+ def io_error(error_message)
127
+ raise IOError, error_message
92
128
  end
93
129
  end
94
130
 
95
131
  # Read info from a PDF file using pdf-reader
96
132
  class Reader
97
133
  def initialize(source)
98
- @reader = PDF::Reader.new(source)
134
+ @reader = ::PDF::Reader.new(source)
99
135
  end
100
136
 
101
137
  def metadata
@@ -111,9 +147,11 @@ module Gaspar
111
147
  end
112
148
  end
113
149
 
114
- # options[:type]
115
- # {cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list}
116
150
  def self.parse(source, target, options = {})
117
151
  Parser.new(source, target, options).parse
118
152
  end
153
+
154
+ def self.parse_with_content(source, target, options = {})
155
+ Parser.new(source, target, options).parse_with_content
156
+ end
119
157
  end
@@ -1,3 +1,3 @@
1
1
  module Gaspar
2
- VERSION = '0.0.1'
2
+ VERSION = '0.0.2'
3
3
  end
@@ -2,35 +2,42 @@ require 'spec_helper'
2
2
 
3
3
  describe Gaspar do
4
4
  before(:all) do
5
- @table_pdf = file_path("table.pdf")
6
- @target_path = "#{Dir::tmpdir}"
7
- @table_html = @target_path + "/output.html"
5
+ @table_pdf = file_path('table.pdf')
6
+ @target_path = Dir.tmpdir.to_s
7
+ @table_html = @target_path + '/output.html'
8
8
  end
9
9
 
10
-
11
10
  describe '#parse' do
12
11
  describe 'with wrong params' do
13
12
  it 'should raise error if source file does not exists' do
14
- c = Gaspar::Parser.new('unknown.pdf', 'unknown.html')
15
- expect { c.parse }.to raise_error(IOError)
13
+ expect do
14
+ c = Gaspar::Parser.new('unknown.pdf', 'unknown.html')
15
+ c.parse
16
+ end.to raise_error(IOError)
16
17
  end
17
18
 
18
19
  it 'should raise error if source is not file nor url' do
19
- c = Gaspar::Parser.new('http:// /.pdf', 'unknown.html')
20
- expect { c.parse }.to raise_error(URI::InvalidURIError)
20
+ expect do
21
+ c = Gaspar::Parser.new('http:// /.pdf', 'unknown.html')
22
+ c.parse
23
+ end.to raise_error(URI::InvalidURIError)
21
24
  end
22
25
  end
23
26
 
24
27
  describe 'with write params' do
25
- it "should be possible to specify one page" do
26
- Gaspar::Parser.new(@table_pdf, @table_html, {
27
- page: 2, format: 'table_html'
28
- }).parse
28
+ it 'should be parsed' do
29
+ Gaspar::Parser.new(@table_pdf, @table_html,
30
+ format: 'table_html').parse
29
31
 
30
32
  doc = Nokogiri::HTML(File.open(@table_html))
31
33
  expect(doc.search('//comment()').text).not_to be_empty
32
34
  end
33
- end
34
35
 
36
+ it 'should get content' do
37
+ content = Gaspar::Parser.new(@table_pdf, @table_html,
38
+ format: 'table_html').parse_with_content
39
+ expect(content).not_to be_empty
40
+ end
41
+ end
35
42
  end
36
43
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gaspar-pdf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - 5rabbits