textractor 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,21 +1,23 @@
1
1
  # textractor
2
2
 
3
- textractor is a ruby library that provides a simple wrapper for
4
- extracting text from PDF and Word documents.
3
+ textractor is a ruby library that provides a simple wrapper around CLI
4
+ tools for extracting text from PDF and Word documents.
5
5
 
6
6
  ## Setup
7
7
 
8
+ gem install textractor
9
+
8
10
  In order to use textractor you have to install a few command line
9
11
  tools.
10
12
 
11
13
  ### OS X
12
14
 
13
- port install wv pdftohtml links
15
+ port install wv xpdf links
14
16
 
15
17
  I also recommend passing +no_x11 to the install command, but
16
18
  this may not work on all systems due to dependency issues.
17
19
 
18
- port install wv pdftohtml links +no_x11
20
+ port install wv xpdf links +no_x11
19
21
 
20
22
  ### Ubuntu 8.04
21
23
 
data/Rakefile CHANGED
@@ -5,12 +5,12 @@ begin
5
5
  require 'jeweler'
6
6
  Jeweler::Tasks.new do |gem|
7
7
  gem.name = "textractor"
8
- gem.summary = %Q{simple wrapper for extracting text from PDF and Word documents}
9
- gem.description = %Q{simple wrapper for extracting text from PDF and Word documents}
8
+ gem.summary = %Q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
9
+ gem.description = %Q{simple wrapper around CLI for extracting text from PDF and Word documents}
10
10
  gem.email = "mguterl@gmail.com"
11
11
  gem.homepage = "http://github.com/mguterl/textractor"
12
12
  gem.authors = ["Michael Guterl"]
13
- gem.add_development_dependency "rspec", ">= 1.2.9"
13
+ gem.add_development_dependency "rspec", ">= 1.3.0"
14
14
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
15
  end
16
16
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.0.2
@@ -7,6 +7,7 @@ module Textractor
7
7
  'application/x-pdf' => :pdf,
8
8
  'application/doc' => :word,
9
9
  'application/x-doc' => :word,
10
+ 'text/plain' => :txt
10
11
  }
11
12
 
12
13
  attr_reader :filename
@@ -27,6 +28,8 @@ module Textractor
27
28
  :pdf
28
29
  when /doc/
29
30
  :word
31
+ when /txt/
32
+ :txt
30
33
  else
31
34
  nil
32
35
  end
@@ -46,6 +49,10 @@ module Textractor
46
49
  `wvWare -c utf-8 --nographics -x #{Textractor.wvText_path} #{filename} 2>/dev/null`.strip
47
50
  end
48
51
 
52
+ def extract_from_txt
53
+ File.read(filename)
54
+ end
55
+
49
56
  end
50
57
 
51
58
  end
@@ -4,6 +4,7 @@ describe Textractor::Document do
4
4
 
5
5
  PDF_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
6
6
  WORD_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")
7
+ TXT_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.txt")
7
8
 
8
9
  it 'should require a filename to create' do
9
10
  expect { Textractor::Document.new }.to raise_error(ArgumentError)
@@ -30,6 +31,15 @@ describe Textractor::Document do
30
31
 
31
32
  end
32
33
 
34
+ describe "with txt document" do
35
+
36
+ it 'should extract the text from the document' do
37
+ @doc = Textractor::Document.new(TXT_DOCUMENT_FIXTURE)
38
+ @doc.text.should == "Ruby on rails developer"
39
+ end
40
+
41
+ end
42
+
33
43
  end
34
44
 
35
45
  describe "#type" do
@@ -0,0 +1 @@
1
+ Ruby on rails developer
@@ -0,0 +1,62 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{textractor}
8
+ s.version = "0.0.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Michael Guterl"]
12
+ s.date = %q{2010-07-26}
13
+ s.description = %q{simple wrapper around CLI for extracting text from PDF and Word documents}
14
+ s.email = %q{mguterl@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.md"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.md",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "lib/textractor.rb",
27
+ "lib/textractor/document.rb",
28
+ "spec/document_spec.rb",
29
+ "spec/fixtures/document.doc",
30
+ "spec/fixtures/document.pdf",
31
+ "spec/fixtures/document.txt",
32
+ "spec/spec.opts",
33
+ "spec/spec_helper.rb",
34
+ "spec/textractor_spec.rb",
35
+ "support/wvText.xml",
36
+ "textractor.gemspec"
37
+ ]
38
+ s.homepage = %q{http://github.com/mguterl/textractor}
39
+ s.rdoc_options = ["--charset=UTF-8"]
40
+ s.require_paths = ["lib"]
41
+ s.rubygems_version = %q{1.3.7}
42
+ s.summary = %q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
43
+ s.test_files = [
44
+ "spec/document_spec.rb",
45
+ "spec/spec_helper.rb",
46
+ "spec/textractor_spec.rb"
47
+ ]
48
+
49
+ if s.respond_to? :specification_version then
50
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
51
+ s.specification_version = 3
52
+
53
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
54
+ s.add_development_dependency(%q<rspec>, [">= 1.3.0"])
55
+ else
56
+ s.add_dependency(%q<rspec>, [">= 1.3.0"])
57
+ end
58
+ else
59
+ s.add_dependency(%q<rspec>, [">= 1.3.0"])
60
+ end
61
+ end
62
+
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textractor
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 27
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
7
8
  - 0
8
- - 1
9
- version: 0.0.1
9
+ - 2
10
+ version: 0.0.2
10
11
  platform: ruby
11
12
  authors:
12
13
  - Michael Guterl
@@ -14,24 +15,26 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2010-04-20 00:00:00 -04:00
18
+ date: 2010-07-26 00:00:00 -04:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency
21
22
  name: rspec
22
23
  prerelease: false
23
24
  requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
24
26
  requirements:
25
27
  - - ">="
26
28
  - !ruby/object:Gem::Version
29
+ hash: 27
27
30
  segments:
28
31
  - 1
29
- - 2
30
- - 9
31
- version: 1.2.9
32
+ - 3
33
+ - 0
34
+ version: 1.3.0
32
35
  type: :development
33
36
  version_requirements: *id001
34
- description: simple wrapper for extracting text from PDF and Word documents
37
+ description: simple wrapper around CLI for extracting text from PDF and Word documents
35
38
  email: mguterl@gmail.com
36
39
  executables: []
37
40
 
@@ -52,10 +55,12 @@ files:
52
55
  - spec/document_spec.rb
53
56
  - spec/fixtures/document.doc
54
57
  - spec/fixtures/document.pdf
58
+ - spec/fixtures/document.txt
55
59
  - spec/spec.opts
56
60
  - spec/spec_helper.rb
57
61
  - spec/textractor_spec.rb
58
62
  - support/wvText.xml
63
+ - textractor.gemspec
59
64
  has_rdoc: true
60
65
  homepage: http://github.com/mguterl/textractor
61
66
  licenses: []
@@ -66,26 +71,30 @@ rdoc_options:
66
71
  require_paths:
67
72
  - lib
68
73
  required_ruby_version: !ruby/object:Gem::Requirement
74
+ none: false
69
75
  requirements:
70
76
  - - ">="
71
77
  - !ruby/object:Gem::Version
78
+ hash: 3
72
79
  segments:
73
80
  - 0
74
81
  version: "0"
75
82
  required_rubygems_version: !ruby/object:Gem::Requirement
83
+ none: false
76
84
  requirements:
77
85
  - - ">="
78
86
  - !ruby/object:Gem::Version
87
+ hash: 3
79
88
  segments:
80
89
  - 0
81
90
  version: "0"
82
91
  requirements: []
83
92
 
84
93
  rubyforge_project:
85
- rubygems_version: 1.3.6
94
+ rubygems_version: 1.3.7
86
95
  signing_key:
87
96
  specification_version: 3
88
- summary: simple wrapper for extracting text from PDF and Word documents
97
+ summary: simple wrapper around CLI tools for extracting text from PDF and Word documents
89
98
  test_files:
90
99
  - spec/document_spec.rb
91
100
  - spec/spec_helper.rb