textractor 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +1 -1
- data/README.md +13 -1
- data/lib/textractor.rb +13 -8
- data/lib/textractor/content_type_detector.rb +7 -2
- data/lib/textractor/{mimetype_fu_content_type_detector.rb → content_type_detector/mimetype_fu.rb} +2 -2
- data/lib/textractor/{simple_content_type_detector.rb → content_type_detector/simple.rb} +2 -2
- data/lib/textractor/version.rb +1 -1
- data/spec/textractor_spec.rb +10 -0
- metadata +5 -6
- data/lib/textractor/extractors/word_extractor.rb +0 -29
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -20,7 +20,11 @@ I recommend using also passing +no_x11 to the install command, but this may not
|
|
20
20
|
|
21
21
|
apt-get install wv xpdf-utils links
|
22
22
|
|
23
|
-
###
|
23
|
+
### Perl (*sigh*)
|
24
|
+
|
25
|
+
Yes, this is slightly ridiculous, but a working perl installation is required in order to extract text from a docx file.
|
26
|
+
|
27
|
+
### mimetype-fu (optional)
|
24
28
|
|
25
29
|
gem install mimetype-fu
|
26
30
|
|
@@ -53,6 +57,14 @@ It's possible to define additional extractors for additional content types. An
|
|
53
57
|
|
54
58
|
Textractor.register_content_type("text/html", HTMLExtractor)
|
55
59
|
|
60
|
+
It is also possible to use a block as a simple content type extractor:
|
61
|
+
|
62
|
+
Textractor.register_content_type("text/html") do |path|
|
63
|
+
data = File.read(path)
|
64
|
+
document = Nokogiri::HTML(data)
|
65
|
+
document.text
|
66
|
+
end
|
67
|
+
|
56
68
|
You can also remove a content type extractor:
|
57
69
|
|
58
70
|
Textractor.remove_content_type("text/html")
|
data/lib/textractor.rb
CHANGED
@@ -5,17 +5,18 @@ module Textractor
|
|
5
5
|
ContentTypeAlreadyRegistered = Class.new(StandardError)
|
6
6
|
ContentTypeNotRegistered = Class.new(StandardError)
|
7
7
|
|
8
|
-
autoload :Extractors,
|
9
|
-
autoload :SimpleContentTypeDetector, 'textractor/simple_content_type_detector'
|
10
|
-
autoload :MimetypeFuContentTypeDetector, 'textractor/mimetype_fu_content_type_detector'
|
8
|
+
autoload :Extractors, 'textractor/extractors'
|
11
9
|
|
12
10
|
def self.text_from_path(path, options = {})
|
13
11
|
raise FileNotFound unless File.exists?(path)
|
14
12
|
content_type = options.fetch(:content_type) { content_type_for_path(path) }
|
15
|
-
|
16
|
-
extractor = extractor_class.new
|
13
|
+
extractor = extractor_for_content_type(content_type)
|
17
14
|
|
18
|
-
extractor.text_from_path(path)
|
15
|
+
if extractor.is_a?(Proc)
|
16
|
+
extractor.call(path)
|
17
|
+
else
|
18
|
+
extractor.new.text_from_path(path)
|
19
|
+
end
|
19
20
|
end
|
20
21
|
|
21
22
|
class << self
|
@@ -26,9 +27,13 @@ module Textractor
|
|
26
27
|
content_type_detector.content_type_for_path(path) or raise UnknownContentType, "unable to determine content type for #{path}"
|
27
28
|
end
|
28
29
|
|
29
|
-
def self.register_content_type(content_type, extractor)
|
30
|
+
def self.register_content_type(content_type, extractor = nil, &block)
|
30
31
|
raise ContentTypeAlreadyRegistered, "#{content_type} is already registered" if extractors[content_type]
|
31
|
-
|
32
|
+
if extractor
|
33
|
+
extractors[content_type] = extractor
|
34
|
+
elsif block_given?
|
35
|
+
extractors[content_type] = block
|
36
|
+
end
|
32
37
|
end
|
33
38
|
|
34
39
|
def self.remove_content_type(content_type)
|
@@ -1,9 +1,14 @@
|
|
1
|
+
module Textractor::ContentTypeDetector
|
2
|
+
autoload :Simple, 'textractor/content_type_detector/simple'
|
3
|
+
autoload :MimetypeFu, 'textractor/content_type_detector/mimetype_fu'
|
4
|
+
end
|
5
|
+
|
1
6
|
begin
|
2
7
|
require 'rubygems'
|
3
8
|
require 'yaml'
|
4
9
|
require 'mimetype_fu'
|
5
10
|
|
6
|
-
Textractor.content_type_detector = Textractor::MimetypeFuContentTypeDetector
|
11
|
+
Textractor.content_type_detector = Textractor::ContentTypeDetector::MimetypeFu
|
7
12
|
rescue LoadError => e
|
8
|
-
Textractor.content_type_detector = Textractor::SimpleContentTypeDetector
|
13
|
+
Textractor.content_type_detector = Textractor::ContentTypeDetector::Simple
|
9
14
|
end
|
data/lib/textractor/version.rb
CHANGED
data/spec/textractor_spec.rb
CHANGED
@@ -47,6 +47,16 @@ describe Textractor do
|
|
47
47
|
}.to raise_error(Textractor::ContentTypeAlreadyRegistered)
|
48
48
|
end
|
49
49
|
|
50
|
+
it 'takes a block for simple cases' do
|
51
|
+
File.stub(:exists?).and_return(true)
|
52
|
+
Textractor.stub(:content_type_for_path).and_return('test')
|
53
|
+
Textractor.register_content_type('test') do |path|
|
54
|
+
path
|
55
|
+
end
|
56
|
+
|
57
|
+
Textractor.text_from_path('document').should == 'document'
|
58
|
+
end
|
59
|
+
|
50
60
|
end
|
51
61
|
|
52
62
|
describe ".extractor_for_content_type" do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 3
|
10
|
-
version: 0.1.3
|
9
|
+
- 4
|
10
|
+
version: 0.1.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Michael Guterl
|
@@ -71,14 +71,13 @@ files:
|
|
71
71
|
- bin/textractor
|
72
72
|
- lib/textractor.rb
|
73
73
|
- lib/textractor/content_type_detector.rb
|
74
|
+
- lib/textractor/content_type_detector/mimetype_fu.rb
|
75
|
+
- lib/textractor/content_type_detector/simple.rb
|
74
76
|
- lib/textractor/extractors.rb
|
75
77
|
- lib/textractor/extractors/doc_extractor.rb
|
76
78
|
- lib/textractor/extractors/docx_extractor.rb
|
77
79
|
- lib/textractor/extractors/pdf_extractor.rb
|
78
80
|
- lib/textractor/extractors/text_extractor.rb
|
79
|
-
- lib/textractor/extractors/word_extractor.rb
|
80
|
-
- lib/textractor/mimetype_fu_content_type_detector.rb
|
81
|
-
- lib/textractor/simple_content_type_detector.rb
|
82
81
|
- lib/textractor/version.rb
|
83
82
|
- spec/fixtures/document.doc
|
84
83
|
- spec/fixtures/document.docx
|
@@ -1,29 +0,0 @@
|
|
1
|
-
module Textractor::Extractors
|
2
|
-
|
3
|
-
class WordExtractor
|
4
|
-
|
5
|
-
DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
|
6
|
-
|
7
|
-
class << self
|
8
|
-
attr_writer :wvText_path
|
9
|
-
|
10
|
-
def wvText_path
|
11
|
-
@wvText_path || DEFAULT_WV_TEXT_PATH
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
def text_from_path(path)
|
16
|
-
command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
|
17
|
-
puts command if $DEBUG
|
18
|
-
`#{command}`.strip
|
19
|
-
end
|
20
|
-
|
21
|
-
private
|
22
|
-
|
23
|
-
def wvText_path
|
24
|
-
self.class.wvText_path
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|