textractor 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- textractor (0.1.3)
4
+ textractor (0.1.4)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
data/README.md CHANGED
@@ -20,7 +20,11 @@ I recommend also passing +no_x11 to the install command, but this may not
20
20
 
21
21
  apt-get install wv xpdf-utils links
22
22
 
23
- ### Optional mimetype-fu
23
+ ### Perl (*sigh*)
24
+
25
+ Yes, this is slightly ridiculous, but a working Perl installation is required to extract text from a docx file.
26
+
27
+ ### mimetype-fu (optional)
24
28
 
25
29
  gem install mimetype-fu
26
30
 
@@ -53,6 +57,14 @@ It's possible to define additional extractors for additional content types. An
53
57
 
54
58
  Textractor.register_content_type("text/html", HTMLExtractor)
55
59
 
60
+ It is also possible to use a block as a simple content type extractor:
61
+
62
+ Textractor.register_content_type("text/html") do |path|
63
+ data = File.read(path)
64
+ document = Nokogiri::HTML(data)
65
+ document.text
66
+ end
67
+
56
68
  You can also remove a content type extractor:
57
69
 
58
70
  Textractor.remove_content_type("text/html")
@@ -5,17 +5,18 @@ module Textractor
5
5
  ContentTypeAlreadyRegistered = Class.new(StandardError)
6
6
  ContentTypeNotRegistered = Class.new(StandardError)
7
7
 
8
- autoload :Extractors, 'textractor/extractors'
9
- autoload :SimpleContentTypeDetector, 'textractor/simple_content_type_detector'
10
- autoload :MimetypeFuContentTypeDetector, 'textractor/mimetype_fu_content_type_detector'
8
+ autoload :Extractors, 'textractor/extractors'
11
9
 
12
10
  def self.text_from_path(path, options = {})
13
11
  raise FileNotFound unless File.exists?(path)
14
12
  content_type = options.fetch(:content_type) { content_type_for_path(path) }
15
- extractor_class = extractor_for_content_type(content_type)
16
- extractor = extractor_class.new
13
+ extractor = extractor_for_content_type(content_type)
17
14
 
18
- extractor.text_from_path(path)
15
+ if extractor.is_a?(Proc)
16
+ extractor.call(path)
17
+ else
18
+ extractor.new.text_from_path(path)
19
+ end
19
20
  end
20
21
 
21
22
  class << self
@@ -26,9 +27,13 @@ module Textractor
26
27
  content_type_detector.content_type_for_path(path) or raise UnknownContentType, "unable to determine content type for #{path}"
27
28
  end
28
29
 
29
- def self.register_content_type(content_type, extractor)
30
+ def self.register_content_type(content_type, extractor = nil, &block)
30
31
  raise ContentTypeAlreadyRegistered, "#{content_type} is already registered" if extractors[content_type]
31
- extractors[content_type] = extractor
32
+ if extractor
33
+ extractors[content_type] = extractor
34
+ elsif block_given?
35
+ extractors[content_type] = block
36
+ end
32
37
  end
33
38
 
34
39
  def self.remove_content_type(content_type)
@@ -1,9 +1,14 @@
1
+ module Textractor::ContentTypeDetector
2
+ autoload :Simple, 'textractor/content_type_detector/simple'
3
+ autoload :MimetypeFu, 'textractor/content_type_detector/mimetype_fu'
4
+ end
5
+
1
6
  begin
2
7
  require 'rubygems'
3
8
  require 'yaml'
4
9
  require 'mimetype_fu'
5
10
 
6
- Textractor.content_type_detector = Textractor::MimetypeFuContentTypeDetector
11
+ Textractor.content_type_detector = Textractor::ContentTypeDetector::MimetypeFu
7
12
  rescue LoadError => e
8
- Textractor.content_type_detector = Textractor::SimpleContentTypeDetector
13
+ Textractor.content_type_detector = Textractor::ContentTypeDetector::Simple
9
14
  end
@@ -1,6 +1,6 @@
1
- module Textractor
1
+ module Textractor::ContentTypeDetector
2
2
 
3
- class MimetypeFuContentTypeDetector
3
+ class MimetypeFu
4
4
 
5
5
  def self.content_type_for_path(path)
6
6
  File.mime_type?(path)
@@ -1,6 +1,6 @@
1
- module Textractor
1
+ module Textractor::ContentTypeDetector
2
2
 
3
- class SimpleContentTypeDetector
3
+ class Simple
4
4
 
5
5
  def self.content_type_for_path(path)
6
6
  case File.extname(path)
@@ -1,3 +1,3 @@
1
1
  module Textractor
2
- VERSION = '0.1.3'
2
+ VERSION = '0.1.4'
3
3
  end
@@ -47,6 +47,16 @@ describe Textractor do
47
47
  }.to raise_error(Textractor::ContentTypeAlreadyRegistered)
48
48
  end
49
49
 
50
+ it 'takes a block for simple cases' do
51
+ File.stub(:exists?).and_return(true)
52
+ Textractor.stub(:content_type_for_path).and_return('test')
53
+ Textractor.register_content_type('test') do |path|
54
+ path
55
+ end
56
+
57
+ Textractor.text_from_path('document').should == 'document'
58
+ end
59
+
50
60
  end
51
61
 
52
62
  describe ".extractor_for_content_type" do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textractor
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 3
10
- version: 0.1.3
9
+ - 4
10
+ version: 0.1.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Michael Guterl
@@ -71,14 +71,13 @@ files:
71
71
  - bin/textractor
72
72
  - lib/textractor.rb
73
73
  - lib/textractor/content_type_detector.rb
74
+ - lib/textractor/content_type_detector/mimetype_fu.rb
75
+ - lib/textractor/content_type_detector/simple.rb
74
76
  - lib/textractor/extractors.rb
75
77
  - lib/textractor/extractors/doc_extractor.rb
76
78
  - lib/textractor/extractors/docx_extractor.rb
77
79
  - lib/textractor/extractors/pdf_extractor.rb
78
80
  - lib/textractor/extractors/text_extractor.rb
79
- - lib/textractor/extractors/word_extractor.rb
80
- - lib/textractor/mimetype_fu_content_type_detector.rb
81
- - lib/textractor/simple_content_type_detector.rb
82
81
  - lib/textractor/version.rb
83
82
  - spec/fixtures/document.doc
84
83
  - spec/fixtures/document.docx
@@ -1,29 +0,0 @@
1
- module Textractor::Extractors
2
-
3
- class WordExtractor
4
-
5
- DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
6
-
7
- class << self
8
- attr_writer :wvText_path
9
-
10
- def wvText_path
11
- @wvText_path || DEFAULT_WV_TEXT_PATH
12
- end
13
- end
14
-
15
- def text_from_path(path)
16
- command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
17
- puts command if $DEBUG
18
- `#{command}`.strip
19
- end
20
-
21
- private
22
-
23
- def wvText_path
24
- self.class.wvText_path
25
- end
26
-
27
- end
28
-
29
- end