rika 1.0.0-java → 1.1.1-java

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -17,3 +17,5 @@ test/version_tmp
17
17
  tmp
18
18
 
19
19
  .DS_Store
20
+ projectFilesBackup/
21
+ .idea/
data/README.md CHANGED
@@ -25,42 +25,60 @@ Or install it yourself as:
25
25
 
26
26
  ## Usage
27
27
 
28
- Something like this:
28
+ For a quick start with the simplest use cases, the following functions
29
+ are provided to get what you need in a single function call, for your convenience:
30
+
31
+ ```ruby
32
+ require 'rika'
33
+
34
+ content = Rika.parse_content('document.pdf') # string containing all content text
35
+ metadata = Rika.parse_metadata('document.pdf') # hash containing the document metadata
36
+ content, metadata = Rika.parse_content_and_metadata('document.pdf') # both of the above
37
+ ```
38
+
39
+ For other use cases and finer control, you can work directly with the Rika::Parser object:
40
+
29
41
  ```ruby
30
- require 'rika'
42
+ require 'rika'
31
43
 
32
- parser = Rika::Parser.new('document.pdf')
44
+ parser = Rika::Parser.new('document.pdf')
33
45
 
34
- # Return the content of the document:
35
- parser.content
46
+ # Return the content of the document:
47
+ parser.content
36
48
 
37
- # Return the media type for the document:
38
- parser.media_type
39
- => "application/pdf"
49
+ # Return the media type for the document:
50
+ parser.media_type
51
+ => "application/pdf"
40
52
 
41
- # Return the metadata field title if it exists:
42
- parser.metadata["title"] if parser.metadata_exists?("title")
53
+ # Return the metadata field title if it exists:
54
+ parser.metadata["title"] if parser.metadata_exists?("title")
43
55
 
44
- # Return all the available metadata keys that can be read from the document
45
- parser.available_metadata
56
+ # Return all the available metadata keys that can be read from the document
57
+ parser.available_metadata
46
58
 
47
- # Return only the first 10000 chars of the content:
48
- parser = Rika::Parser.new('document.pdf', 10000)
49
- parser.content # 10000 first chars returned
59
+ # Return only the first 10000 chars of the content:
60
+ parser = Rika::Parser.new('document.pdf', 10000)
61
+ parser.content # 10000 first chars returned
50
62
 
51
- # Return content from URL
52
- parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
53
- parser.content
63
+ # Return content from URL
64
+ parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
65
+ parser.content
54
66
 
55
- # Return the language for the content
56
- parser = parser = Rika::Parser.new('german document.pdf')
57
- parser.language
58
- => "de"
67
+ # Return the language for the content
68
+ parser = Rika::Parser.new('german document.pdf')
69
+ parser.language
70
+ => "de"
59
71
 
60
- # Check whether the langugage identification is certain enough to be trusted
61
- parser.language_is_reasonably_certain?
72
+ # Check whether the language identification is certain enough to be trusted
73
+ parser.language_is_reasonably_certain?
62
74
 
63
75
  ```
76
+
77
+ ## Credits
78
+ The following people have contributed ideas, documentation, or code to Rika:
79
+ * Keith Bennett
80
+ * Richard Nyström
81
+
64
82
  ## Contributing
65
83
 
66
84
  1. Fork it
@@ -18,7 +18,23 @@ module Rika
18
18
  import org.apache.tika.language.LanguageIdentifier
19
19
  import java.io.FileInputStream
20
20
  import java.net.URL
21
-
21
+
22
+ def self.parse_content_and_metadata(file_location, max_content_length = -1)
23
+ parser = Parser.new(file_location, max_content_length)
24
+ [parser.content, parser.metadata]
25
+ end
26
+
27
+ def self.parse_content(file_location, max_content_length = -1)
28
+ parser = Parser.new(file_location, max_content_length)
29
+ parser.content
30
+ end
31
+
32
+ def self.parse_metadata(file_location)
33
+ parser = Parser.new(file_location, 0)
34
+ parser.metadata
35
+ end
36
+
37
+
22
38
  class Parser
23
39
 
24
40
  def initialize(file_location, max_content_length = -1)
@@ -1,3 +1,3 @@
1
1
  module Rika
2
- VERSION = "1.0.0"
2
+ VERSION = "1.1.1"
3
3
  end
@@ -21,6 +21,7 @@ describe Rika::Parser do
21
21
  :AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
22
22
  @server.start
23
23
  end
24
+ @sample_pdf_filespec = file_path("document.pdf")
24
25
  end
25
26
 
26
27
  after(:all) do
@@ -173,4 +174,20 @@ describe Rika::Parser do
173
174
  lang.language_is_reasonably_certain? == true
174
175
  end
175
176
  end
176
- end
177
+
178
+ it "should return valid content using Rika.parse_content" do
179
+ content = Rika.parse_content(@sample_pdf_filespec)
180
+ (content.should be_a(String)) && (content.should_not be_empty)
181
+ end
182
+
183
+ it "should return valid metadata using Rika.parse_metadata" do
184
+ metadata = Rika.parse_metadata(@sample_pdf_filespec)
185
+ (metadata.should be_a(Hash)) && (metadata.should_not be_empty)
186
+ end
187
+
188
+ it "should return valid content and metadata using Rika.parse_content_and_metadata" do
189
+ content, metadata = Rika.parse_content_and_metadata(@sample_pdf_filespec)
190
+ (content.should be_a(String)) && (content.should_not be_empty) && \
191
+ (metadata.should be_a(Hash)) && (metadata.should_not be_empty)
192
+ end
193
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.1
5
5
  prerelease:
6
6
  platform: java
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-23 00:00:00.000000000 Z
12
+ date: 2013-02-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -43,17 +43,17 @@ dependencies:
43
43
  none: false
44
44
  prerelease: false
45
45
  type: :development
46
- description: ! ' A JRuby wrapper for Apache Tika to extract text and metadata from
47
- various file formats. '
46
+ description: " A JRuby wrapper for Apache Tika to extract text and metadata from various\
47
+ \ file formats. "
48
48
  email:
49
49
  - ricny046@gmail.com
50
50
  executables: []
51
51
  extensions: []
52
52
  extra_rdoc_files: []
53
53
  files:
54
- - .gitignore
55
- - .rspec
56
- - .travis.yml
54
+ - ".gitignore"
55
+ - ".rspec"
56
+ - ".travis.yml"
57
57
  - Gemfile
58
58
  - LICENSE.txt
59
59
  - README.md
@@ -123,14 +123,14 @@ require_paths:
123
123
  - lib
124
124
  required_ruby_version: !ruby/object:Gem::Requirement
125
125
  requirements:
126
- - - ! '>='
126
+ - - ">="
127
127
  - !ruby/object:Gem::Version
128
128
  version: !binary |-
129
129
  MA==
130
130
  none: false
131
131
  required_rubygems_version: !ruby/object:Gem::Requirement
132
132
  requirements:
133
- - - ! '>='
133
+ - - ">="
134
134
  - !ruby/object:Gem::Version
135
135
  version: !binary |-
136
136
  MA==