doc_ripper 0.0.7.2 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 78c69f5ebf8057f2b87109802a74fad71cbe63ea
4
- data.tar.gz: b685a42672f7bbf2a90cdba49035c6409f1d84f4
3
+ metadata.gz: 48d68fcd85b60d3ab64df91710c31158a998dc1a
4
+ data.tar.gz: 0aba15d63c943c4f33f5dbe14503907a72744a07
5
5
  SHA512:
6
- metadata.gz: 60533a3a30d444a10aebc1476d09f4ea029b69735c4629438f8bd58a38dd011c40fb5280d85b55e1194dde0b79331f44ef8c3cea61e40f59b489a4c932f22330
7
- data.tar.gz: 4aec11cc7a90bade2e9fd06a980d4cec6b67de9c6d472e9cae54ca14218b2fbbb157192cfba8bfc9740f9f620f2e9ffea7cfceb021c030eb00673cf27a77e1e0
6
+ metadata.gz: 104082f0efdf157273abda2e201b55cb315fe861005e4a14d358a09297a2f2b7b5d98e04fdd371196f85c6325f841ad82932bb33c123cc9432b35e9c5f45d572
7
+ data.tar.gz: 2f1b2d0c81f45ed03e197b0a0a2eaa41a057629f8c5c32f2107dc09233e67f12a141e5a19fa53ab8e97dd434e66f7fee18cddf1aa6c81bf600e974c9440383de
@@ -27,4 +27,7 @@ Gem::Specification.new do |spec|
27
27
  spec.add_development_dependency "bundler", "~> 1.6"
28
28
  spec.add_development_dependency "rake", "~> 10.0"
29
29
  spec.add_development_dependency "rspec"
30
+ spec.add_development_dependency "sqlite3"
31
+ spec.add_development_dependency "rubocop"
32
+ spec.add_development_dependency "pry"
30
33
  end
@@ -1,23 +1,46 @@
1
1
  require 'shellwords'
2
- require "doc_ripper/version"
3
- require "doc_ripper/base"
4
- require "doc_ripper/text_ripper"
5
- require "doc_ripper/formats/pdf_ripper"
6
- require "doc_ripper/formats/docx_ripper"
7
- require "doc_ripper/formats/ms_doc_ripper"
8
- require "doc_ripper/formats/sketch_ripper"
9
- require "doc_ripper/exceptions"
2
+ require 'doc_ripper/version'
3
+ require 'doc_ripper/base'
4
+ require 'doc_ripper/formats/text_ripper'
5
+ require 'doc_ripper/formats/pdf_ripper'
6
+ require 'doc_ripper/formats/docx_ripper'
7
+ require 'doc_ripper/formats/ms_doc_ripper'
8
+ require 'doc_ripper/formats/sketch_ripper'
9
+ require 'doc_ripper/exceptions'
10
10
 
11
11
  module DocRipper
12
12
  class << self
13
13
  def rip(path, options = {})
14
- TextRipper.new(path, options).text
14
+ ripper = choose_ripper(path)
15
+ ripper.text unless ripper.nil?
15
16
  end
16
17
 
17
18
  def rip!(path)
18
- text = rip(path, raise: true)
19
+ ripper = choose_ripper(path)
20
+ raise(UnsupportedFileType) if ripper.nil?
19
21
 
20
- text || raise(FileNotFound)
22
+ ripper.text || raise(FileNotFound)
23
+ end
24
+
25
+ private
26
+
27
+ def choose_ripper(file_path)
28
+ ripper = begin
29
+ case
30
+ when !!(file_path =~ /.docx$/i)
31
+ ripper = Formats::DocxRipper.new(file_path)
32
+ when !!(file_path =~ /.doc$/i)
33
+ ripper = Formats::MsDocRipper.new(file_path)
34
+ when !!(file_path =~ /.pdf$/i)
35
+ ripper = Formats::PdfRipper.new(file_path)
36
+ when !!(file_path =~ /.sketch$/i)
37
+ ripper = Formats::SketchRipper.new(file_path)
38
+ when !!(file_path =~ /.txt$/i)
39
+ ripper = Formats::TextRipper.new(file_path)
40
+ end
41
+ end
42
+
43
+ ripper
21
44
  end
22
45
  end
23
46
  end
@@ -2,16 +2,16 @@ module DocRipper
2
2
  module Ripper
3
3
 
4
4
  class Base
5
- attr_reader :text
5
+ attr_reader :file_path
6
6
 
7
- def initialize(file_path, options = {})
7
+ def initialize(file_path)
8
+ file_parts = file_path.split('.')
8
9
  @file_path = file_path
9
- @text_file_path = "#{file_path.split('.').first}.txt"
10
- @options = options
10
+ @extension = file_parts.last
11
11
  end
12
12
 
13
- def read_type
14
- :file
13
+ def text
14
+ @text ||= rip
15
15
  end
16
16
 
17
17
  private
@@ -22,4 +22,4 @@ module DocRipper
22
22
  end
23
23
 
24
24
  end
25
- end
25
+ end
@@ -3,9 +3,13 @@ module DocRipper
3
3
  class DocxRipper < Ripper::Base
4
4
 
5
5
  def rip
6
- @text ||= system(%Q[ unzip -p #{to_shell(@file_path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$' > #{to_shell(@text_file_path)} ])
6
+ @text ||= begin
7
+ text = %x(unzip -p #{to_shell(file_path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$')
8
+
9
+ text.empty? ? nil : text
10
+ end
7
11
  end
8
12
 
9
13
  end
10
14
  end
11
- end
15
+ end
@@ -3,9 +3,9 @@ module DocRipper
3
3
  class MsDocRipper < Ripper::Base
4
4
 
5
5
  def rip
6
- @text ||= system(%Q[ antiword #{to_shell(@file_path)} > #{to_shell(@text_file_path)} ])
6
+ @text ||= %x(antiword #{to_shell(file_path)})
7
7
  end
8
8
 
9
9
  end
10
10
  end
11
- end
11
+ end
@@ -3,9 +3,9 @@ module DocRipper
3
3
  class PdfRipper < Ripper::Base
4
4
 
5
5
  def rip
6
- @text ||= system(%Q[ pdftotext #{to_shell(@file_path)} > #{to_shell(@text_file_path)} ])
6
+ @text ||= %x(pdftotext #{to_shell(file_path)})
7
7
  end
8
8
 
9
9
  end
10
10
  end
11
- end
11
+ end
@@ -4,6 +4,7 @@ require 'colored'
4
4
  SQLITE_LOAD_WARNING = 'SQLite3 optional dependency not found. Sketch files are not supported in this mode.'.yellow
5
5
 
6
6
  begin
7
+ # gem 'sqlite3'
7
8
  require 'sqlite3'
8
9
  rescue LoadError
9
10
  warn SQLITE_LOAD_WARNING
@@ -45,17 +46,13 @@ module DocRipper
45
46
  end
46
47
  end
47
48
 
48
- def initialize(file_path, options= {})
49
+ def initialize(file_path)
49
50
  raise Sqlite3NotFound if !defined?(SQLite3)
50
51
  super
51
52
  end
52
53
 
53
- def read_type
54
- :mem
55
- end
56
-
57
54
  def rip
58
- db = SQLite3::Database.new(@file_path)
55
+ db = SQLite3::Database.new(file_path)
59
56
  data = db.execute("SELECT value FROM payload").flatten.first
60
57
  @text ||= text_objects(data).join(" ").strip
61
58
  end
@@ -0,0 +1,13 @@
1
+ #encoding: UTF-8
2
+
3
+ module DocRipper
4
+ module Formats
5
+ class TextRipper < Ripper::Base
6
+
7
+ def rip
8
+ File.read(@file_path)
9
+ end
10
+
11
+ end
12
+ end
13
+ end
@@ -1,5 +1,3 @@
1
1
  module DocRipper
2
- VERSION = "0.0.7.2"
2
+ VERSION = "0.0.8"
3
3
  end
4
-
5
-
@@ -10,8 +10,38 @@ module DocRipper
10
10
  let(:missing_path) { "#{
11
11
  FIXTURE_PATH}some_missing_path.docx" }
12
12
 
13
- context '#rip' do
13
+ describe 'full utf-8 encoding' do
14
+ let(:result) { DocRipper.rip(file_path) }
15
+
16
+ context 'txt file' do
17
+ let(:str) { '¿Cuál es su nombre?' }
18
+ let(:file_path) { "#{FIXTURE_PATH}encoding_sample.txt" }
19
+
20
+ before(:each) do
21
+ File.write(file_path, str)
22
+ end
23
+
24
+ it 'maintains encoding' do
25
+ expect(result).to eq(str)
26
+ end
27
+
28
+ after(:each) do
29
+ File.delete(file_path)
30
+ end
31
+ end
32
+
33
+ describe 'docx file' do
34
+ let(:str) { '四、我们确认,我们完全同意招标文件制定的投标规则,并承诺按照这些规则履行我们的所有义务,包括一旦投标文件被贵方接受,将履行社会资本合作方的义务' }
35
+ let(:file_path) { "#{FIXTURE_PATH}chinese.docx" }
14
36
 
37
+ it 'maintains encoding' do
38
+ puts result.encoding
39
+ expect(result).to include(str)
40
+ end
41
+ end
42
+ end
43
+
44
+ context '#rip' do
15
45
  it 'should respond to #rip' do
16
46
  expect(DocRipper.respond_to? :rip).to eq(true)
17
47
  end
@@ -34,7 +64,6 @@ module DocRipper
34
64
  it 'should remove the dumped text version of the file' do
35
65
 
36
66
  end
37
-
38
67
  end
39
68
 
40
69
  context '#rip!' do
@@ -51,7 +80,5 @@ module DocRipper
51
80
  end
52
81
 
53
82
  end
54
-
55
-
56
83
  end
57
- end
84
+ end
@@ -26,4 +26,4 @@ module DocRipper
26
26
  end
27
27
  end
28
28
  end
29
- end
29
+ end
@@ -0,0 +1,9 @@
1
+
2
+ Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod
3
+ tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim
4
+ veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea
5
+ commodo consequat. Duis aute irure dolor in reprehenderit in voluptate
6
+ velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat
7
+ cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id
8
+ est laborum.
9
+
@@ -1,4 +1,6 @@
1
1
  require 'bundler/setup'
2
+ require 'pry'
3
+
2
4
  Bundler.setup
3
5
 
4
6
  require 'doc_ripper'
@@ -8,4 +10,4 @@ FIXTURE_PATH = "#{File.expand_path '../',__FILE__}/fixtures/"
8
10
 
9
11
  RSpec.configure do |config|
10
12
  # some (optional) config here
11
- end
13
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc_ripper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7.2
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Zaich
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-15 00:00:00.000000000 Z
11
+ date: 2017-11-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: CFPropertyList
@@ -80,6 +80,48 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: sqlite3
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rubocop
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: pry
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
83
125
  description: Scrape text from common file formats (.pdf,.doc,.docx, .sketch, .txt)
84
126
  with a single convenient command.
85
127
  email:
@@ -102,20 +144,21 @@ files:
102
144
  - lib/doc_ripper/formats/ms_doc_ripper.rb
103
145
  - lib/doc_ripper/formats/pdf_ripper.rb
104
146
  - lib/doc_ripper/formats/sketch_ripper.rb
105
- - lib/doc_ripper/text_ripper.rb
147
+ - lib/doc_ripper/formats/text_ripper.rb
106
148
  - lib/doc_ripper/version.rb
107
149
  - pkg/doc_ripper-0.0.5.gem
108
150
  - pkg/doc_ripper-0.0.6.gem
109
151
  - pkg/doc_ripper-0.0.7.1.gem
152
+ - pkg/doc_ripper-0.0.7.2.gem
110
153
  - pkg/doc_ripper-0.0.7.gem
111
- - spec/doc_ripper/base_spec.rb
112
- - spec/doc_ripper/formats/doc_ripper_spec.rb
154
+ - spec/doc_ripper/doc_ripper_spec.rb
113
155
  - spec/doc_ripper/formats/sketch_ripper_spec.rb
114
- - spec/doc_ripper/formats/text_ripper_spec.rb
156
+ - spec/fixtures/chinese.docx
115
157
  - spec/fixtures/complex_sketch_text.sketch
116
158
  - spec/fixtures/lorem.doc
117
159
  - spec/fixtures/lorem.docx
118
160
  - spec/fixtures/lorem.pdf
161
+ - spec/fixtures/lorem.txt
119
162
  - spec/fixtures/missing_file.txt
120
163
  - spec/fixtures/simple_sketch_text.sketch
121
164
  - spec/fixtures/some_missing_path.txt
@@ -142,19 +185,19 @@ requirements:
142
185
  - Antiword
143
186
  - pdftotext/poppler
144
187
  rubyforge_project:
145
- rubygems_version: 2.6.6
188
+ rubygems_version: 2.6.14
146
189
  signing_key:
147
190
  specification_version: 4
148
191
  summary: Rip out text from pdf, doc and docx formats
149
192
  test_files:
150
- - spec/doc_ripper/base_spec.rb
151
- - spec/doc_ripper/formats/doc_ripper_spec.rb
193
+ - spec/doc_ripper/doc_ripper_spec.rb
152
194
  - spec/doc_ripper/formats/sketch_ripper_spec.rb
153
- - spec/doc_ripper/formats/text_ripper_spec.rb
195
+ - spec/fixtures/chinese.docx
154
196
  - spec/fixtures/complex_sketch_text.sketch
155
197
  - spec/fixtures/lorem.doc
156
198
  - spec/fixtures/lorem.docx
157
199
  - spec/fixtures/lorem.pdf
200
+ - spec/fixtures/lorem.txt
158
201
  - spec/fixtures/missing_file.txt
159
202
  - spec/fixtures/simple_sketch_text.sketch
160
203
  - spec/fixtures/some_missing_path.txt
@@ -1,43 +0,0 @@
1
- #encoding: UTF-8
2
-
3
- module DocRipper
4
- class TextRipper < Ripper::Base
5
- attr_reader :text_file_path, :file_path
6
-
7
- def ripped?
8
- @is_ripped ||=choose_ripper
9
- end
10
-
11
- def text
12
- if ripped? && @ripper.read_type == :file
13
- @text = IO.read(@text_file_path).force_encoding("ISO-8859-1").encode("utf-8", replace: nil)
14
- File.delete(@text_file_path)
15
-
16
- elsif ripped? && @ripper.read_type == :mem
17
- @text = @ripper.text
18
- end
19
-
20
- @text
21
- end
22
-
23
- private
24
-
25
- def choose_ripper
26
- case
27
- when !!(@file_path =~ /.docx$/i)
28
- @ripper = Formats::DocxRipper.new(@file_path)
29
- when !!(@file_path =~ /.doc$/i)
30
- @ripper = Formats::MsDocRipper.new(@file_path)
31
- when !!(@file_path =~ /.pdf$/i)
32
- @ripper = Formats::PdfRipper.new(@file_path)
33
- when !!(@file_path =~ /.sketch$/i)
34
- @ripper = Formats::SketchRipper.new(@file_path)
35
- when @options[:raise]
36
- raise UnsupportedFileType
37
- end
38
-
39
- @ripper.rip
40
- end
41
-
42
- end
43
- end
@@ -1,9 +0,0 @@
1
- require 'spec_helper'
2
-
3
- module DocRipper
4
- module Ripper
5
- describe 'Base' do
6
-
7
- end
8
- end
9
- end
@@ -1,7 +0,0 @@
1
- require 'spec_helper'
2
-
3
- module DocRipper
4
- describe 'TextRipper' do
5
-
6
- end
7
- end