textractor 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,21 +1,23 @@
1
1
  # textractor
2
2
 
3
- textractor is a ruby library that provides a simple wrapper for
4
- extracting text from PDF and Word documents.
3
+ textractor is a ruby library that provides a simple wrapper around CLI
4
+ tools for extracting text from PDF and Word documents.
5
5
 
6
6
  ## Setup
7
7
 
8
+ gem install textractor
9
+
8
10
  In order to use textractor you have to install a few command line
9
11
  tools.
10
12
 
11
13
  ### OS X
12
14
 
13
- port install wv pdftohtml links
15
+ port install wv xpdf links
14
16
 
15
17
  I recommend also passing +no_x11 to the install command, but
16
18
  this may not work on all systems due to dependency issues.
17
19
 
18
- port install wv pdftohtml links +no_x11
20
+ port install wv xpdf links +no_x11
19
21
 
20
22
  ### Ubuntu 8.04
21
23
 
data/Rakefile CHANGED
@@ -5,12 +5,12 @@ begin
5
5
  require 'jeweler'
6
6
  Jeweler::Tasks.new do |gem|
7
7
  gem.name = "textractor"
8
- gem.summary = %Q{simple wrapper for extracting text from PDF and Word documents}
9
- gem.description = %Q{simple wrapper for extracting text from PDF and Word documents}
8
+ gem.summary = %Q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
9
+ gem.description = %Q{simple wrapper around CLI for extracting text from PDF and Word documents}
10
10
  gem.email = "mguterl@gmail.com"
11
11
  gem.homepage = "http://github.com/mguterl/textractor"
12
12
  gem.authors = ["Michael Guterl"]
13
- gem.add_development_dependency "rspec", ">= 1.2.9"
13
+ gem.add_development_dependency "rspec", ">= 1.3.0"
14
14
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
15
  end
16
16
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.0.2
@@ -7,6 +7,7 @@ module Textractor
7
7
  'application/x-pdf' => :pdf,
8
8
  'application/doc' => :word,
9
9
  'application/x-doc' => :word,
10
+ 'text/plain' => :txt
10
11
  }
11
12
 
12
13
  attr_reader :filename
@@ -27,6 +28,8 @@ module Textractor
27
28
  :pdf
28
29
  when /doc/
29
30
  :word
31
+ when /txt/
32
+ :txt
30
33
  else
31
34
  nil
32
35
  end
@@ -46,6 +49,10 @@ module Textractor
46
49
  `wvWare -c utf-8 --nographics -x #{Textractor.wvText_path} #{filename} 2>/dev/null`.strip
47
50
  end
48
51
 
52
+ def extract_from_txt
53
+ File.read(filename)
54
+ end
55
+
49
56
  end
50
57
 
51
58
  end
@@ -4,6 +4,7 @@ describe Textractor::Document do
4
4
 
5
5
  PDF_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
6
6
  WORD_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")
7
+ TXT_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.txt")
7
8
 
8
9
  it 'should require a filename to create' do
9
10
  expect { Textractor::Document.new }.to raise_error(ArgumentError)
@@ -30,6 +31,15 @@ describe Textractor::Document do
30
31
 
31
32
  end
32
33
 
34
+ describe "with txt document" do
35
+
36
+ it 'should extract the text from the document' do
37
+ @doc = Textractor::Document.new(TXT_DOCUMENT_FIXTURE)
38
+ @doc.text.should == "Ruby on rails developer"
39
+ end
40
+
41
+ end
42
+
33
43
  end
34
44
 
35
45
  describe "#type" do
@@ -0,0 +1 @@
1
+ Ruby on rails developer
@@ -0,0 +1,62 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{textractor}
8
+ s.version = "0.0.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Michael Guterl"]
12
+ s.date = %q{2010-07-26}
13
+ s.description = %q{simple wrapper around CLI for extracting text from PDF and Word documents}
14
+ s.email = %q{mguterl@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.md"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.md",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "lib/textractor.rb",
27
+ "lib/textractor/document.rb",
28
+ "spec/document_spec.rb",
29
+ "spec/fixtures/document.doc",
30
+ "spec/fixtures/document.pdf",
31
+ "spec/fixtures/document.txt",
32
+ "spec/spec.opts",
33
+ "spec/spec_helper.rb",
34
+ "spec/textractor_spec.rb",
35
+ "support/wvText.xml",
36
+ "textractor.gemspec"
37
+ ]
38
+ s.homepage = %q{http://github.com/mguterl/textractor}
39
+ s.rdoc_options = ["--charset=UTF-8"]
40
+ s.require_paths = ["lib"]
41
+ s.rubygems_version = %q{1.3.7}
42
+ s.summary = %q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
43
+ s.test_files = [
44
+ "spec/document_spec.rb",
45
+ "spec/spec_helper.rb",
46
+ "spec/textractor_spec.rb"
47
+ ]
48
+
49
+ if s.respond_to? :specification_version then
50
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
51
+ s.specification_version = 3
52
+
53
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
54
+ s.add_development_dependency(%q<rspec>, [">= 1.3.0"])
55
+ else
56
+ s.add_dependency(%q<rspec>, [">= 1.3.0"])
57
+ end
58
+ else
59
+ s.add_dependency(%q<rspec>, [">= 1.3.0"])
60
+ end
61
+ end
62
+
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textractor
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 27
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
7
8
  - 0
8
- - 1
9
- version: 0.0.1
9
+ - 2
10
+ version: 0.0.2
10
11
  platform: ruby
11
12
  authors:
12
13
  - Michael Guterl
@@ -14,24 +15,26 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2010-04-20 00:00:00 -04:00
18
+ date: 2010-07-26 00:00:00 -04:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency
21
22
  name: rspec
22
23
  prerelease: false
23
24
  requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
24
26
  requirements:
25
27
  - - ">="
26
28
  - !ruby/object:Gem::Version
29
+ hash: 27
27
30
  segments:
28
31
  - 1
29
- - 2
30
- - 9
31
- version: 1.2.9
32
+ - 3
33
+ - 0
34
+ version: 1.3.0
32
35
  type: :development
33
36
  version_requirements: *id001
34
- description: simple wrapper for extracting text from PDF and Word documents
37
+ description: simple wrapper around CLI for extracting text from PDF and Word documents
35
38
  email: mguterl@gmail.com
36
39
  executables: []
37
40
 
@@ -52,10 +55,12 @@ files:
52
55
  - spec/document_spec.rb
53
56
  - spec/fixtures/document.doc
54
57
  - spec/fixtures/document.pdf
58
+ - spec/fixtures/document.txt
55
59
  - spec/spec.opts
56
60
  - spec/spec_helper.rb
57
61
  - spec/textractor_spec.rb
58
62
  - support/wvText.xml
63
+ - textractor.gemspec
59
64
  has_rdoc: true
60
65
  homepage: http://github.com/mguterl/textractor
61
66
  licenses: []
@@ -66,26 +71,30 @@ rdoc_options:
66
71
  require_paths:
67
72
  - lib
68
73
  required_ruby_version: !ruby/object:Gem::Requirement
74
+ none: false
69
75
  requirements:
70
76
  - - ">="
71
77
  - !ruby/object:Gem::Version
78
+ hash: 3
72
79
  segments:
73
80
  - 0
74
81
  version: "0"
75
82
  required_rubygems_version: !ruby/object:Gem::Requirement
83
+ none: false
76
84
  requirements:
77
85
  - - ">="
78
86
  - !ruby/object:Gem::Version
87
+ hash: 3
79
88
  segments:
80
89
  - 0
81
90
  version: "0"
82
91
  requirements: []
83
92
 
84
93
  rubyforge_project:
85
- rubygems_version: 1.3.6
94
+ rubygems_version: 1.3.7
86
95
  signing_key:
87
96
  specification_version: 3
88
- summary: simple wrapper for extracting text from PDF and Word documents
97
+ summary: simple wrapper around CLI tools for extracting text from PDF and Word documents
89
98
  test_files:
90
99
  - spec/document_spec.rb
91
100
  - spec/spec_helper.rb