textractor 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +1 -1
- data/README.md +13 -1
- data/lib/textractor.rb +13 -8
- data/lib/textractor/content_type_detector.rb +7 -2
- data/lib/textractor/{mimetype_fu_content_type_detector.rb → content_type_detector/mimetype_fu.rb} +2 -2
- data/lib/textractor/{simple_content_type_detector.rb → content_type_detector/simple.rb} +2 -2
- data/lib/textractor/version.rb +1 -1
- data/spec/textractor_spec.rb +10 -0
- metadata +5 -6
- data/lib/textractor/extractors/word_extractor.rb +0 -29
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -20,7 +20,11 @@ I recommend using also passing +no_x11 to the install command, but this may not
|
|
20
20
|
|
21
21
|
apt-get install wv xpdf-utils links
|
22
22
|
|
23
|
-
### mimetype-fu (optional)
|
23
|
+
### Perl (*sigh*)
|
24
|
+
|
25
|
+
Yes, this is slightly ridiculous, but a working perl installation is required in order to extract text from a docx file.
|
26
|
+
|
27
|
+
### mimetype-fu (optional)
|
24
28
|
|
25
29
|
gem install mimetype-fu
|
26
30
|
|
@@ -53,6 +57,14 @@ It's possible to define additional extractors for additional content types. An
|
|
53
57
|
|
54
58
|
Textractor.register_content_type("text/html", HTMLExtractor)
|
55
59
|
|
60
|
+
It is also possible to use a block as a simple content type extractor:
|
61
|
+
|
62
|
+
Textractor.register_content_type("text/html") do |path|
|
63
|
+
data = File.read(path)
|
64
|
+
document = Nokogiri::HTML(data)
|
65
|
+
document.text
|
66
|
+
end
|
67
|
+
|
56
68
|
You can also remove a content type extractor:
|
57
69
|
|
58
70
|
Textractor.remove_content_type("text/html")
|
data/lib/textractor.rb
CHANGED
@@ -5,17 +5,18 @@ module Textractor
|
|
5
5
|
ContentTypeAlreadyRegistered = Class.new(StandardError)
|
6
6
|
ContentTypeNotRegistered = Class.new(StandardError)
|
7
7
|
|
8
|
-
autoload :Extractors,
|
9
|
-
autoload :SimpleContentTypeDetector, 'textractor/simple_content_type_detector'
|
10
|
-
autoload :MimetypeFuContentTypeDetector, 'textractor/mimetype_fu_content_type_detector'
|
8
|
+
autoload :Extractors, 'textractor/extractors'
|
11
9
|
|
12
10
|
def self.text_from_path(path, options = {})
|
13
11
|
raise FileNotFound unless File.exists?(path)
|
14
12
|
content_type = options.fetch(:content_type) { content_type_for_path(path) }
|
15
|
-
|
16
|
-
extractor = extractor_class.new
|
13
|
+
extractor = extractor_for_content_type(content_type)
|
17
14
|
|
18
|
-
extractor.
|
15
|
+
if extractor.is_a?(Proc)
|
16
|
+
extractor.call(path)
|
17
|
+
else
|
18
|
+
extractor.new.text_from_path(path)
|
19
|
+
end
|
19
20
|
end
|
20
21
|
|
21
22
|
class << self
|
@@ -26,9 +27,13 @@ module Textractor
|
|
26
27
|
content_type_detector.content_type_for_path(path) or raise UnknownContentType, "unable to determine content type for #{path}"
|
27
28
|
end
|
28
29
|
|
29
|
-
def self.register_content_type(content_type, extractor)
|
30
|
+
def self.register_content_type(content_type, extractor = nil, &block)
|
30
31
|
raise ContentTypeAlreadyRegistered, "#{content_type} is already registered" if extractors[content_type]
|
31
|
-
|
32
|
+
if extractor
|
33
|
+
extractors[content_type] = extractor
|
34
|
+
elsif block_given?
|
35
|
+
extractors[content_type] = block
|
36
|
+
end
|
32
37
|
end
|
33
38
|
|
34
39
|
def self.remove_content_type(content_type)
|
data/lib/textractor/content_type_detector.rb
CHANGED
@@ -1,9 +1,14 @@
|
|
1
|
+
module Textractor::ContentTypeDetector
|
2
|
+
autoload :Simple, 'textractor/content_type_detector/simple'
|
3
|
+
autoload :MimetypeFu, 'textractor/content_type_detector/mimetype_fu'
|
4
|
+
end
|
5
|
+
|
1
6
|
begin
|
2
7
|
require 'rubygems'
|
3
8
|
require 'yaml'
|
4
9
|
require 'mimetype_fu'
|
5
10
|
|
6
|
-
Textractor.content_type_detector = Textractor::MimetypeFuContentTypeDetector
|
11
|
+
Textractor.content_type_detector = Textractor::ContentTypeDetector::MimetypeFu
|
7
12
|
rescue LoadError => e
|
8
|
-
Textractor.content_type_detector = Textractor::SimpleContentTypeDetector
|
13
|
+
Textractor.content_type_detector = Textractor::ContentTypeDetector::Simple
|
9
14
|
end
|
data/lib/textractor/version.rb
CHANGED
data/spec/textractor_spec.rb
CHANGED
@@ -47,6 +47,16 @@ describe Textractor do
|
|
47
47
|
}.to raise_error(Textractor::ContentTypeAlreadyRegistered)
|
48
48
|
end
|
49
49
|
|
50
|
+
it 'takes a block for simple cases' do
|
51
|
+
File.stub(:exists?).and_return(true)
|
52
|
+
Textractor.stub(:content_type_for_path).and_return('test')
|
53
|
+
Textractor.register_content_type('test') do |path|
|
54
|
+
path
|
55
|
+
end
|
56
|
+
|
57
|
+
Textractor.text_from_path('document').should == 'document'
|
58
|
+
end
|
59
|
+
|
50
60
|
end
|
51
61
|
|
52
62
|
describe ".extractor_for_content_type" do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 3
|
10
|
-
version: 0.1.3
|
9
|
+
- 4
|
10
|
+
version: 0.1.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Michael Guterl
|
@@ -71,14 +71,13 @@ files:
|
|
71
71
|
- bin/textractor
|
72
72
|
- lib/textractor.rb
|
73
73
|
- lib/textractor/content_type_detector.rb
|
74
|
+
- lib/textractor/content_type_detector/mimetype_fu.rb
|
75
|
+
- lib/textractor/content_type_detector/simple.rb
|
74
76
|
- lib/textractor/extractors.rb
|
75
77
|
- lib/textractor/extractors/doc_extractor.rb
|
76
78
|
- lib/textractor/extractors/docx_extractor.rb
|
77
79
|
- lib/textractor/extractors/pdf_extractor.rb
|
78
80
|
- lib/textractor/extractors/text_extractor.rb
|
79
|
-
- lib/textractor/extractors/word_extractor.rb
|
80
|
-
- lib/textractor/mimetype_fu_content_type_detector.rb
|
81
|
-
- lib/textractor/simple_content_type_detector.rb
|
82
81
|
- lib/textractor/version.rb
|
83
82
|
- spec/fixtures/document.doc
|
84
83
|
- spec/fixtures/document.docx
|
data/lib/textractor/extractors/word_extractor.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
module Textractor::Extractors
|
2
|
-
|
3
|
-
class WordExtractor
|
4
|
-
|
5
|
-
DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
|
6
|
-
|
7
|
-
class << self
|
8
|
-
attr_writer :wvText_path
|
9
|
-
|
10
|
-
def wvText_path
|
11
|
-
@wvText_path || DEFAULT_WV_TEXT_PATH
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
def text_from_path(path)
|
16
|
-
command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
|
17
|
-
puts command if $DEBUG
|
18
|
-
`#{command}`.strip
|
19
|
-
end
|
20
|
-
|
21
|
-
private
|
22
|
-
|
23
|
-
def wvText_path
|
24
|
-
self.class.wvText_path
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|