doc_ripper 0.0.7.2 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 78c69f5ebf8057f2b87109802a74fad71cbe63ea
4
- data.tar.gz: b685a42672f7bbf2a90cdba49035c6409f1d84f4
3
+ metadata.gz: 48d68fcd85b60d3ab64df91710c31158a998dc1a
4
+ data.tar.gz: 0aba15d63c943c4f33f5dbe14503907a72744a07
5
5
  SHA512:
6
- metadata.gz: 60533a3a30d444a10aebc1476d09f4ea029b69735c4629438f8bd58a38dd011c40fb5280d85b55e1194dde0b79331f44ef8c3cea61e40f59b489a4c932f22330
7
- data.tar.gz: 4aec11cc7a90bade2e9fd06a980d4cec6b67de9c6d472e9cae54ca14218b2fbbb157192cfba8bfc9740f9f620f2e9ffea7cfceb021c030eb00673cf27a77e1e0
6
+ metadata.gz: 104082f0efdf157273abda2e201b55cb315fe861005e4a14d358a09297a2f2b7b5d98e04fdd371196f85c6325f841ad82932bb33c123cc9432b35e9c5f45d572
7
+ data.tar.gz: 2f1b2d0c81f45ed03e197b0a0a2eaa41a057629f8c5c32f2107dc09233e67f12a141e5a19fa53ab8e97dd434e66f7fee18cddf1aa6c81bf600e974c9440383de
@@ -27,4 +27,7 @@ Gem::Specification.new do |spec|
27
27
  spec.add_development_dependency "bundler", "~> 1.6"
28
28
  spec.add_development_dependency "rake", "~> 10.0"
29
29
  spec.add_development_dependency "rspec"
30
+ spec.add_development_dependency "sqlite3"
31
+ spec.add_development_dependency "rubocop"
32
+ spec.add_development_dependency "pry"
30
33
  end
@@ -1,23 +1,46 @@
1
1
  require 'shellwords'
2
- require "doc_ripper/version"
3
- require "doc_ripper/base"
4
- require "doc_ripper/text_ripper"
5
- require "doc_ripper/formats/pdf_ripper"
6
- require "doc_ripper/formats/docx_ripper"
7
- require "doc_ripper/formats/ms_doc_ripper"
8
- require "doc_ripper/formats/sketch_ripper"
9
- require "doc_ripper/exceptions"
2
+ require 'doc_ripper/version'
3
+ require 'doc_ripper/base'
4
+ require 'doc_ripper/formats/text_ripper'
5
+ require 'doc_ripper/formats/pdf_ripper'
6
+ require 'doc_ripper/formats/docx_ripper'
7
+ require 'doc_ripper/formats/ms_doc_ripper'
8
+ require 'doc_ripper/formats/sketch_ripper'
9
+ require 'doc_ripper/exceptions'
10
10
 
11
11
  module DocRipper
12
12
  class << self
13
13
  def rip(path, options = {})
14
- TextRipper.new(path, options).text
14
+ ripper = choose_ripper(path)
15
+ ripper.text unless ripper.nil?
15
16
  end
16
17
 
17
18
  def rip!(path)
18
- text = rip(path, raise: true)
19
+ ripper = choose_ripper(path)
20
+ raise(UnsupportedFileType) if ripper.nil?
19
21
 
20
- text || raise(FileNotFound)
22
+ ripper.text || raise(FileNotFound)
23
+ end
24
+
25
+ private
26
+
27
+ def choose_ripper(file_path)
28
+ ripper = begin
29
+ case
30
+ when !!(file_path =~ /.docx$/i)
31
+ ripper = Formats::DocxRipper.new(file_path)
32
+ when !!(file_path =~ /.doc$/i)
33
+ ripper = Formats::MsDocRipper.new(file_path)
34
+ when !!(file_path =~ /.pdf$/i)
35
+ ripper = Formats::PdfRipper.new(file_path)
36
+ when !!(file_path =~ /.sketch$/i)
37
+ ripper = Formats::SketchRipper.new(file_path)
38
+ when !!(file_path =~ /.txt$/i)
39
+ ripper = Formats::TextRipper.new(file_path)
40
+ end
41
+ end
42
+
43
+ ripper
21
44
  end
22
45
  end
23
46
  end
@@ -2,16 +2,16 @@ module DocRipper
2
2
  module Ripper
3
3
 
4
4
  class Base
5
- attr_reader :text
5
+ attr_reader :file_path
6
6
 
7
- def initialize(file_path, options = {})
7
+ def initialize(file_path)
8
+ file_parts = file_path.split('.')
8
9
  @file_path = file_path
9
- @text_file_path = "#{file_path.split('.').first}.txt"
10
- @options = options
10
+ @extension = file_parts.last
11
11
  end
12
12
 
13
- def read_type
14
- :file
13
+ def text
14
+ @text ||= rip
15
15
  end
16
16
 
17
17
  private
@@ -22,4 +22,4 @@ module DocRipper
22
22
  end
23
23
 
24
24
  end
25
- end
25
+ end
@@ -3,9 +3,13 @@ module DocRipper
3
3
  class DocxRipper < Ripper::Base
4
4
 
5
5
  def rip
6
- @text ||= system(%Q[ unzip -p #{to_shell(@file_path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$' > #{to_shell(@text_file_path)} ])
6
+ @text ||= begin
7
+ text = %x(unzip -p #{to_shell(file_path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$')
8
+
9
+ text.empty? ? nil : text
10
+ end
7
11
  end
8
12
 
9
13
  end
10
14
  end
11
- end
15
+ end
@@ -3,9 +3,9 @@ module DocRipper
3
3
  class MsDocRipper < Ripper::Base
4
4
 
5
5
  def rip
6
- @text ||= system(%Q[ antiword #{to_shell(@file_path)} > #{to_shell(@text_file_path)} ])
6
+ @text ||= %x(antiword #{to_shell(file_path)})
7
7
  end
8
8
 
9
9
  end
10
10
  end
11
- end
11
+ end
@@ -3,9 +3,9 @@ module DocRipper
3
3
  class PdfRipper < Ripper::Base
4
4
 
5
5
  def rip
6
- @text ||= system(%Q[ pdftotext #{to_shell(@file_path)} > #{to_shell(@text_file_path)} ])
6
+ @text ||= %x(pdftotext #{to_shell(file_path)})
7
7
  end
8
8
 
9
9
  end
10
10
  end
11
- end
11
+ end
@@ -4,6 +4,7 @@ require 'colored'
4
4
  SQLITE_LOAD_WARNING = 'SQLite3 optional dependency not found. Sketch files are not supported in this mode.'.yellow
5
5
 
6
6
  begin
7
+ # gem 'sqlite3'
7
8
  require 'sqlite3'
8
9
  rescue LoadError
9
10
  warn SQLITE_LOAD_WARNING
@@ -45,17 +46,13 @@ module DocRipper
45
46
  end
46
47
  end
47
48
 
48
- def initialize(file_path, options= {})
49
+ def initialize(file_path)
49
50
  raise Sqlite3NotFound if !defined?(SQLite3)
50
51
  super
51
52
  end
52
53
 
53
- def read_type
54
- :mem
55
- end
56
-
57
54
  def rip
58
- db = SQLite3::Database.new(@file_path)
55
+ db = SQLite3::Database.new(file_path)
59
56
  data = db.execute("SELECT value FROM payload").flatten.first
60
57
  @text ||= text_objects(data).join(" ").strip
61
58
  end
@@ -0,0 +1,13 @@
1
+ #encoding: UTF-8
2
+
3
module DocRipper
  module Formats
    # Ripper for plain-text files: the "ripped" text is simply the file's
    # contents.
    class TextRipper < Ripper::Base
      # Reads the whole file and returns it as a UTF-8 tagged String.
      #
      # The encoding is passed explicitly so the result does not depend on the
      # process locale (Encoding.default_external); the path is taken from the
      # +file_path+ reader exposed by the base class, as the other rippers do.
      #
      # @return [String] the file contents
      def rip
        File.read(file_path, encoding: 'UTF-8')
      end
    end
  end
end
@@ -1,5 +1,3 @@
1
1
  module DocRipper
2
- VERSION = "0.0.7.2"
2
+ VERSION = "0.0.8"
3
3
  end
4
-
5
-
@@ -10,8 +10,38 @@ module DocRipper
10
10
  let(:missing_path) { "#{
11
11
  FIXTURE_PATH}some_missing_path.docx" }
12
12
 
13
- context '#rip' do
13
describe 'full utf-8 encoding' do
  # Text ripped from whichever fixture the nested context points at.
  let(:result) { DocRipper.rip(file_path) }

  context 'txt file' do
    let(:str) { '¿Cuál es su nombre?' }
    let(:file_path) { "#{FIXTURE_PATH}encoding_sample.txt" }

    # The fixture is generated per example so the expected bytes are exactly
    # the ones in +str+.
    before(:each) do
      File.write(file_path, str)
    end

    # Guarded so a failure while creating the fixture is not masked by a
    # second error raised during cleanup.
    after(:each) do
      File.delete(file_path) if File.exist?(file_path)
    end

    it 'maintains encoding' do
      expect(result).to eq(str)
    end
  end

  context 'docx file' do
    let(:str) { '四、我们确认,我们完全同意招标文件制定的投标规则,并承诺按照这些规则履行我们的所有义务,包括一旦投标文件被贵方接受,将履行社会资本合作方的义务' }
    let(:file_path) { "#{FIXTURE_PATH}chinese.docx" }

    it 'maintains encoding' do
      expect(result).to include(str)
    end
  end
end
43
+
44
+ context '#rip' do
15
45
  it 'should respond to #rip' do
16
46
  expect(DocRipper.respond_to? :rip).to eq(true)
17
47
  end
@@ -34,7 +64,6 @@ module DocRipper
34
64
  it 'should remove the dumped text version of the file' do
35
65
 
36
66
  end
37
-
38
67
  end
39
68
 
40
69
  context '#rip!' do
@@ -51,7 +80,5 @@ module DocRipper
51
80
  end
52
81
 
53
82
  end
54
-
55
-
56
83
  end
57
- end
84
+ end
@@ -26,4 +26,4 @@ module DocRipper
26
26
  end
27
27
  end
28
28
  end
29
- end
29
+ end
@@ -0,0 +1,9 @@
1
+
2
+ Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod
3
+ tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim
4
+ veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea
5
+ commodo consequat. Duis aute irure dolor in reprehenderit in voluptate
6
+ velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat
7
+ cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id
8
+ est laborum.
9
+
@@ -1,4 +1,6 @@
1
1
  require 'bundler/setup'
2
+ require 'pry'
3
+
2
4
  Bundler.setup
3
5
 
4
6
  require 'doc_ripper'
@@ -8,4 +10,4 @@ FIXTURE_PATH = "#{File.expand_path '../',__FILE__}/fixtures/"
8
10
 
9
11
  RSpec.configure do |config|
10
12
  # some (optional) config here
11
- end
13
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc_ripper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7.2
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Zaich
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-15 00:00:00.000000000 Z
11
+ date: 2017-11-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: CFPropertyList
@@ -80,6 +80,48 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: sqlite3
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rubocop
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: pry
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
83
125
  description: Scrape text from common file formats (.pdf,.doc,.docx, .sketch, .txt)
84
126
  with a single convenient command.
85
127
  email:
@@ -102,20 +144,21 @@ files:
102
144
  - lib/doc_ripper/formats/ms_doc_ripper.rb
103
145
  - lib/doc_ripper/formats/pdf_ripper.rb
104
146
  - lib/doc_ripper/formats/sketch_ripper.rb
105
- - lib/doc_ripper/text_ripper.rb
147
+ - lib/doc_ripper/formats/text_ripper.rb
106
148
  - lib/doc_ripper/version.rb
107
149
  - pkg/doc_ripper-0.0.5.gem
108
150
  - pkg/doc_ripper-0.0.6.gem
109
151
  - pkg/doc_ripper-0.0.7.1.gem
152
+ - pkg/doc_ripper-0.0.7.2.gem
110
153
  - pkg/doc_ripper-0.0.7.gem
111
- - spec/doc_ripper/base_spec.rb
112
- - spec/doc_ripper/formats/doc_ripper_spec.rb
154
+ - spec/doc_ripper/doc_ripper_spec.rb
113
155
  - spec/doc_ripper/formats/sketch_ripper_spec.rb
114
- - spec/doc_ripper/formats/text_ripper_spec.rb
156
+ - spec/fixtures/chinese.docx
115
157
  - spec/fixtures/complex_sketch_text.sketch
116
158
  - spec/fixtures/lorem.doc
117
159
  - spec/fixtures/lorem.docx
118
160
  - spec/fixtures/lorem.pdf
161
+ - spec/fixtures/lorem.txt
119
162
  - spec/fixtures/missing_file.txt
120
163
  - spec/fixtures/simple_sketch_text.sketch
121
164
  - spec/fixtures/some_missing_path.txt
@@ -142,19 +185,19 @@ requirements:
142
185
  - Antiword
143
186
  - pdftotext/poppler
144
187
  rubyforge_project:
145
- rubygems_version: 2.6.6
188
+ rubygems_version: 2.6.14
146
189
  signing_key:
147
190
  specification_version: 4
148
191
  summary: Rip out text from pdf, doc and docx formats
149
192
  test_files:
150
- - spec/doc_ripper/base_spec.rb
151
- - spec/doc_ripper/formats/doc_ripper_spec.rb
193
+ - spec/doc_ripper/doc_ripper_spec.rb
152
194
  - spec/doc_ripper/formats/sketch_ripper_spec.rb
153
- - spec/doc_ripper/formats/text_ripper_spec.rb
195
+ - spec/fixtures/chinese.docx
154
196
  - spec/fixtures/complex_sketch_text.sketch
155
197
  - spec/fixtures/lorem.doc
156
198
  - spec/fixtures/lorem.docx
157
199
  - spec/fixtures/lorem.pdf
200
+ - spec/fixtures/lorem.txt
158
201
  - spec/fixtures/missing_file.txt
159
202
  - spec/fixtures/simple_sketch_text.sketch
160
203
  - spec/fixtures/some_missing_path.txt
@@ -1,43 +0,0 @@
1
- #encoding: UTF-8
2
-
3
- module DocRipper
4
- class TextRipper < Ripper::Base
5
- attr_reader :text_file_path, :file_path
6
-
7
- def ripped?
8
- @is_ripped ||=choose_ripper
9
- end
10
-
11
- def text
12
- if ripped? && @ripper.read_type == :file
13
- @text = IO.read(@text_file_path).force_encoding("ISO-8859-1").encode("utf-8", replace: nil)
14
- File.delete(@text_file_path)
15
-
16
- elsif ripped? && @ripper.read_type == :mem
17
- @text = @ripper.text
18
- end
19
-
20
- @text
21
- end
22
-
23
- private
24
-
25
- def choose_ripper
26
- case
27
- when !!(@file_path =~ /.docx$/i)
28
- @ripper = Formats::DocxRipper.new(@file_path)
29
- when !!(@file_path =~ /.doc$/i)
30
- @ripper = Formats::MsDocRipper.new(@file_path)
31
- when !!(@file_path =~ /.pdf$/i)
32
- @ripper = Formats::PdfRipper.new(@file_path)
33
- when !!(@file_path =~ /.sketch$/i)
34
- @ripper = Formats::SketchRipper.new(@file_path)
35
- when @options[:raise]
36
- raise UnsupportedFileType
37
- end
38
-
39
- @ripper.rip
40
- end
41
-
42
- end
43
- end
@@ -1,9 +0,0 @@
1
- require 'spec_helper'
2
-
3
- module DocRipper
4
- module Ripper
5
- describe 'Base' do
6
-
7
- end
8
- end
9
- end
@@ -1,7 +0,0 @@
1
- require 'spec_helper'
2
-
3
- module DocRipper
4
- describe 'TextRipper' do
5
-
6
- end
7
- end