textractor 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- textractor (0.1.3)
4
+ textractor (0.1.4)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
data/README.md CHANGED
@@ -20,7 +20,11 @@ I recommend also passing +no_x11 to the install command, but this may not
20
20
 
21
21
  apt-get install wv xpdf-utils links
22
22
 
23
- ### Optional mimetype-fu
23
+ ### Perl (*sigh*)
24
+
25
+ Yes, this is slightly ridiculous, but a working Perl installation is required to extract text from a docx file.
26
+
27
+ ### mimetype-fu (optional)
24
28
 
25
29
  gem install mimetype-fu
26
30
 
@@ -53,6 +57,14 @@ It's possible to define additional extractors for additional content types. An
53
57
 
54
58
  Textractor.register_content_type("text/html", HTMLExtractor)
55
59
 
60
+ It is also possible to use a block as a simple content type extractor:
61
+
62
+ Textractor.register_content_type("text/html") do |path|
63
+ data = File.read(path)
64
+ document = Nokogiri::HTML(data)
65
+ document.text
66
+ end
67
+
56
68
  You can also remove a content type extractor:
57
69
 
58
70
  Textractor.remove_content_type("text/html")
@@ -5,17 +5,18 @@ module Textractor
5
5
  ContentTypeAlreadyRegistered = Class.new(StandardError)
6
6
  ContentTypeNotRegistered = Class.new(StandardError)
7
7
 
8
- autoload :Extractors, 'textractor/extractors'
9
- autoload :SimpleContentTypeDetector, 'textractor/simple_content_type_detector'
10
- autoload :MimetypeFuContentTypeDetector, 'textractor/mimetype_fu_content_type_detector'
8
+ autoload :Extractors, 'textractor/extractors'
11
9
 
12
10
  def self.text_from_path(path, options = {})
13
11
  raise FileNotFound unless File.exists?(path)
14
12
  content_type = options.fetch(:content_type) { content_type_for_path(path) }
15
- extractor_class = extractor_for_content_type(content_type)
16
- extractor = extractor_class.new
13
+ extractor = extractor_for_content_type(content_type)
17
14
 
18
- extractor.text_from_path(path)
15
+ if extractor.is_a?(Proc)
16
+ extractor.call(path)
17
+ else
18
+ extractor.new.text_from_path(path)
19
+ end
19
20
  end
20
21
 
21
22
  class << self
@@ -26,9 +27,13 @@ module Textractor
26
27
  content_type_detector.content_type_for_path(path) or raise UnknownContentType, "unable to determine content type for #{path}"
27
28
  end
28
29
 
29
- def self.register_content_type(content_type, extractor)
30
+ def self.register_content_type(content_type, extractor = nil, &block)
30
31
  raise ContentTypeAlreadyRegistered, "#{content_type} is already registered" if extractors[content_type]
31
- extractors[content_type] = extractor
32
+ if extractor
33
+ extractors[content_type] = extractor
34
+ elsif block_given?
35
+ extractors[content_type] = block
36
+ end
32
37
  end
33
38
 
34
39
  def self.remove_content_type(content_type)
@@ -1,9 +1,14 @@
1
+ module Textractor::ContentTypeDetector
2
+ autoload :Simple, 'textractor/content_type_detector/simple'
3
+ autoload :MimetypeFu, 'textractor/content_type_detector/mimetype_fu'
4
+ end
5
+
1
6
  begin
2
7
  require 'rubygems'
3
8
  require 'yaml'
4
9
  require 'mimetype_fu'
5
10
 
6
- Textractor.content_type_detector = Textractor::MimetypeFuContentTypeDetector
11
+ Textractor.content_type_detector = Textractor::ContentTypeDetector::MimetypeFu
7
12
  rescue LoadError => e
8
- Textractor.content_type_detector = Textractor::SimpleContentTypeDetector
13
+ Textractor.content_type_detector = Textractor::ContentTypeDetector::Simple
9
14
  end
@@ -1,6 +1,6 @@
1
- module Textractor
1
+ module Textractor::ContentTypeDetector
2
2
 
3
- class MimetypeFuContentTypeDetector
3
+ class MimetypeFu
4
4
 
5
5
  def self.content_type_for_path(path)
6
6
  File.mime_type?(path)
@@ -1,6 +1,6 @@
1
- module Textractor
1
+ module Textractor::ContentTypeDetector
2
2
 
3
- class SimpleContentTypeDetector
3
+ class Simple
4
4
 
5
5
  def self.content_type_for_path(path)
6
6
  case File.extname(path)
@@ -1,3 +1,3 @@
1
1
  module Textractor
2
- VERSION = '0.1.3'
2
+ VERSION = '0.1.4'
3
3
  end
@@ -47,6 +47,16 @@ describe Textractor do
47
47
  }.to raise_error(Textractor::ContentTypeAlreadyRegistered)
48
48
  end
49
49
 
50
+ it 'takes a block for simple cases' do
51
+ File.stub(:exists?).and_return(true)
52
+ Textractor.stub(:content_type_for_path).and_return('test')
53
+ Textractor.register_content_type('test') do |path|
54
+ path
55
+ end
56
+
57
+ Textractor.text_from_path('document').should == 'document'
58
+ end
59
+
50
60
  end
51
61
 
52
62
  describe ".extractor_for_content_type" do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textractor
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 3
10
- version: 0.1.3
9
+ - 4
10
+ version: 0.1.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Michael Guterl
@@ -71,14 +71,13 @@ files:
71
71
  - bin/textractor
72
72
  - lib/textractor.rb
73
73
  - lib/textractor/content_type_detector.rb
74
+ - lib/textractor/content_type_detector/mimetype_fu.rb
75
+ - lib/textractor/content_type_detector/simple.rb
74
76
  - lib/textractor/extractors.rb
75
77
  - lib/textractor/extractors/doc_extractor.rb
76
78
  - lib/textractor/extractors/docx_extractor.rb
77
79
  - lib/textractor/extractors/pdf_extractor.rb
78
80
  - lib/textractor/extractors/text_extractor.rb
79
- - lib/textractor/extractors/word_extractor.rb
80
- - lib/textractor/mimetype_fu_content_type_detector.rb
81
- - lib/textractor/simple_content_type_detector.rb
82
81
  - lib/textractor/version.rb
83
82
  - spec/fixtures/document.doc
84
83
  - spec/fixtures/document.docx
@@ -1,29 +0,0 @@
1
- module Textractor::Extractors
2
-
3
- class WordExtractor
4
-
5
- DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
6
-
7
- class << self
8
- attr_writer :wvText_path
9
-
10
- def wvText_path
11
- @wvText_path || DEFAULT_WV_TEXT_PATH
12
- end
13
- end
14
-
15
- def text_from_path(path)
16
- command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
17
- puts command if $DEBUG
18
- `#{command}`.strip
19
- end
20
-
21
- private
22
-
23
- def wvText_path
24
- self.class.wvText_path
25
- end
26
-
27
- end
28
-
29
- end