gaspar-pdf 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 61dc07bbed984fb395fe3e6de603f862906c853a
4
- data.tar.gz: b1130b45efc157db8a9c0ccc6a6f768d71b6e159
3
+ metadata.gz: 5d61897dcb86359510caf89940b8b4273f8df92d
4
+ data.tar.gz: 44482e7a3dd4bb9c53995ad16e49ae9a01142bc0
5
5
  SHA512:
6
- metadata.gz: 1aa7157bf22ca40cb902fa894a17ac329e52d002b6387fe9fa0850537ab205b4063ad3e6e77a919e1d4e10461c8dc02eabdee733811a4e4481930be54a6fd989
7
- data.tar.gz: b4aeb3d4ade931275b57d2924ae1639257634d98460bdfffe85c34a3e7b921d9e4dd7d18fb0f61cd8d17f0540de13875bb097ae582566ce2c49a53459ffee998
6
+ metadata.gz: a50d871c11e83bb6a696bee1d75df7be959dd2979337d408d3e84c70aa303a04760c343ed255b9385044745fe760bb6889b8881f78f771e1afa5efc8c1c67a42
7
+ data.tar.gz: 880bee08cf94da0969b4ee738b051a22219b94f0c8facc7bdc34ed9444a54156eb0cf033eb623c2988655229ef2f98b66771eb6bc0d98139f3844f1bf0335c8f
data/README.md CHANGED
@@ -29,13 +29,18 @@ require 'gaspar'
29
29
  # This requires that the pdf-table-extract command is present in your PATH.
30
30
 
31
31
  Gaspar.parse('document.pdf', 'document.html', {
32
- page: 1, format: 'table_html'
32
+ format: 'table_html'
33
33
  })
34
34
 
35
35
  # Available options:
36
36
  # page - page to parse
37
37
  # format - the type of output: [cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list]
38
38
 
39
+ content = Gaspar.parse_with_content('document.pdf', 'document.html', {
40
+ format: 'table_html'
41
+ })
42
+
43
+ # you can get parsed content
39
44
  ```
40
45
 
41
46
  Inspired by [Kristin](https://github.com/ricn/kristin)
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
14
14
  spec.license = 'MIT'
15
15
 
16
16
  spec.files = `git ls-files`.split($RS)
17
-
17
+
18
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
19
19
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
20
20
  spec.require_paths = ['lib']
@@ -2,6 +2,7 @@ require 'gaspar/version'
2
2
  require 'open-uri'
3
3
  require 'net/http'
4
4
  require 'spoon'
5
+ require 'pdf-reader'
5
6
 
6
7
  # Gaspar Gem
7
8
  module Gaspar
@@ -12,34 +13,92 @@ module Gaspar
12
13
  @source = source
13
14
  @target = target
14
15
  @options = options
16
+ @extractor = extractor
15
17
  end
16
18
 
17
19
  def parse
20
+ @extractor.extract
21
+ end
22
+
23
+ def parse_with_content
24
+ @extractor.extract
25
+ @extractor.content
26
+ end
27
+
28
+ private
29
+
30
+ def extractor
31
+ src = determine_source(@source)
32
+ pdf = Reader.new(src)
33
+
34
+ Extractor.new(
35
+ src, @target, pdf.page_count, @options
36
+ )
37
+ end
38
+
39
+ def random_source_name
40
+ rand(16**16).to_s(16)
41
+ end
42
+
43
+ def download_file(source)
44
+ tmp_file = "/tmp/#{random_source_name}.pdf"
45
+ File.open(tmp_file, 'wb') do |saved_file|
46
+ open(URI.encode(source), 'rb') do |read_file|
47
+ saved_file.write(read_file.read)
48
+ end
49
+ end
50
+
51
+ tmp_file
52
+ end
53
+
54
+ def determine_source(source)
55
+ is_file = File.exist?(source) && !File.directory?(source)
56
+ is_http = URI(source).scheme == 'http'
57
+ is_https = URI(source).scheme == 'https'
58
+
59
+ unless is_file || is_http || is_https
60
+ raise IOError, "Source (#{source}) is neither a file nor an URL."
61
+ end
62
+
63
+ is_file ? source : download_file(source)
64
+ end
65
+ end
66
+
67
+ # Extract data from all pages of PDF
68
+ class Extractor
69
+ def initialize(source, target, pages, options)
70
+ @source = source
71
+ @target = target
72
+ @pages = pages
73
+ @options = options
74
+ end
75
+
76
+ def extract
18
77
  unless command_available?
19
78
  io_error 'Can\'t find pdf-table-extract executable in PATH'
20
79
  end
21
80
 
22
- src = determine_source(@source)
23
- opts = process_options(src).split(' ')
81
+ opts = process_options.split(' ')
24
82
  args = [extract_command, opts].flatten
25
83
 
26
84
  pid = Spoon.spawnp(*args)
27
85
  Process.waitpid(pid)
86
+ io_error("Could not parse #{@source}") unless $?.exitstatus.zero?
87
+ end
28
88
 
29
- io_error("Could not parse #{src}") unless $?.exitstatus.zero?
89
+ def content
90
+ open(@target, 'rb').read
30
91
  end
31
92
 
32
93
  private
33
94
 
34
- def io_error(error_message)
35
- raise IOError, error_message
36
- end
37
-
38
- def process_options(source)
95
+ def process_options
39
96
  opts = []
40
- opts.push("-i #{source}") if source
97
+ opts.push("-i #{@source}") if @source
41
98
  opts.push("-o #{@target}") if @target
42
- opts.push("-p #{@options[:page]}") if @options[:page]
99
+ @pages.times do |p|
100
+ opts.push("-p #{p + 1}")
101
+ end
43
102
  opts.push("-t #{@options[:format]}") if @options[:format]
44
103
 
45
104
  opts.join(' ')
@@ -64,38 +123,15 @@ module Gaspar
64
123
  nil
65
124
  end
66
125
 
67
- def random_source_name
68
- rand(16**16).to_s(16)
69
- end
70
-
71
- def download_file(source)
72
- tmp_file = "/tmp/#{random_source_name}.pdf"
73
- File.open(tmp_file, 'wb') do |saved_file|
74
- open(URI.encode(source), 'rb') do |read_file|
75
- saved_file.write(read_file.read)
76
- end
77
- end
78
-
79
- tmp_file
80
- end
81
-
82
- def determine_source(source)
83
- is_file = File.exist?(source) && !File.directory?(source)
84
- is_http = URI(source).scheme == 'http'
85
- is_https = URI(source).scheme == 'https'
86
-
87
- unless is_file || is_http || is_https
88
- raise IOError, "Source (#{source}) is neither a file nor an URL."
89
- end
90
-
91
- is_file ? source : download_file(source)
126
+ def io_error(error_message)
127
+ raise IOError, error_message
92
128
  end
93
129
  end
94
130
 
95
131
  # Read info from a PDF file using pdf-reader
96
132
  class Reader
97
133
  def initialize(source)
98
- @reader = PDF::Reader.new(source)
134
+ @reader = ::PDF::Reader.new(source)
99
135
  end
100
136
 
101
137
  def metadata
@@ -111,9 +147,11 @@ module Gaspar
111
147
  end
112
148
  end
113
149
 
114
- # options[:type]
115
- # {cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list}
116
150
  def self.parse(source, target, options = {})
117
151
  Parser.new(source, target, options).parse
118
152
  end
153
+
154
+ def self.parse_with_content(source, target, options = {})
155
+ Parser.new(source, target, options).parse_with_content
156
+ end
119
157
  end
@@ -1,3 +1,3 @@
1
1
  module Gaspar
2
- VERSION = '0.0.1'
2
+ VERSION = '0.0.2'
3
3
  end
@@ -2,35 +2,42 @@ require 'spec_helper'
2
2
 
3
3
  describe Gaspar do
4
4
  before(:all) do
5
- @table_pdf = file_path("table.pdf")
6
- @target_path = "#{Dir::tmpdir}"
7
- @table_html = @target_path + "/output.html"
5
+ @table_pdf = file_path('table.pdf')
6
+ @target_path = Dir.tmpdir.to_s
7
+ @table_html = @target_path + '/output.html'
8
8
  end
9
9
 
10
-
11
10
  describe '#parse' do
12
11
  describe 'with wrong params' do
13
12
  it 'should raise error if source file does not exists' do
14
- c = Gaspar::Parser.new('unknown.pdf', 'unknown.html')
15
- expect { c.parse }.to raise_error(IOError)
13
+ expect do
14
+ c = Gaspar::Parser.new('unknown.pdf', 'unknown.html')
15
+ c.parse
16
+ end.to raise_error(IOError)
16
17
  end
17
18
 
18
19
  it 'should raise error if source is not file nor url' do
19
- c = Gaspar::Parser.new('http:// /.pdf', 'unknown.html')
20
- expect { c.parse }.to raise_error(URI::InvalidURIError)
20
+ expect do
21
+ c = Gaspar::Parser.new('http:// /.pdf', 'unknown.html')
22
+ c.parse
23
+ end.to raise_error(URI::InvalidURIError)
21
24
  end
22
25
  end
23
26
 
24
27
  describe 'with write params' do
25
- it "should be possible to specify one page" do
26
- Gaspar::Parser.new(@table_pdf, @table_html, {
27
- page: 2, format: 'table_html'
28
- }).parse
28
+ it 'should be parsed' do
29
+ Gaspar::Parser.new(@table_pdf, @table_html,
30
+ format: 'table_html').parse
29
31
 
30
32
  doc = Nokogiri::HTML(File.open(@table_html))
31
33
  expect(doc.search('//comment()').text).not_to be_empty
32
34
  end
33
- end
34
35
 
36
+ it 'should get content' do
37
+ content = Gaspar::Parser.new(@table_pdf, @table_html,
38
+ format: 'table_html').parse_with_content
39
+ expect(content).not_to be_empty
40
+ end
41
+ end
35
42
  end
36
43
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gaspar-pdf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - 5rabbits