rika 1.0.0-java → 1.1.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -17,3 +17,5 @@ test/version_tmp
17
17
  tmp
18
18
 
19
19
  .DS_Store
20
+ projectFilesBackup/
21
+ .idea/
data/README.md CHANGED
@@ -25,42 +25,60 @@ Or install it yourself as:
25
25
 
26
26
  ## Usage
27
27
 
28
- Something like this:
28
+ For a quick start with the simplest use cases, the following functions
29
+ are provided to get what you need in a single function call, for your convenience:
30
+
31
+ ```ruby
32
+ require 'rika'
33
+
34
+ content = Rika.parse_content('document.pdf') # string containing all content text
35
+ metadata = Rika.parse_metadata('document.pdf') # hash containing the document metadata
36
+ content, metadata = Rika.parse_content_and_metadata('document.pdf') # both of the above
37
+ ```
38
+
39
+ For other use cases and finer control, you can work directly with the Rika::Parser object:
40
+
29
41
  ```ruby
30
- require 'rika'
42
+ require 'rika'
31
43
 
32
- parser = Rika::Parser.new('document.pdf')
44
+ parser = Rika::Parser.new('document.pdf')
33
45
 
34
- # Return the content of the document:
35
- parser.content
46
+ # Return the content of the document:
47
+ parser.content
36
48
 
37
- # Return the media type for the document:
38
- parser.media_type
39
- => "application/pdf"
49
+ # Return the media type for the document:
50
+ parser.media_type
51
+ => "application/pdf"
40
52
 
41
- # Return the metadata field title if it exists:
42
- parser.metadata["title"] if parser.metadata_exists?("title")
53
+ # Return the metadata field title if it exists:
54
+ parser.metadata["title"] if parser.metadata_exists?("title")
43
55
 
44
- # Return all the available metadata keys that can be read from the document
45
- parser.available_metadata
56
+ # Return all the available metadata keys that can be read from the document
57
+ parser.available_metadata
46
58
 
47
- # Return only the first 10000 chars of the content:
48
- parser = Rika::Parser.new('document.pdf', 10000)
49
- parser.content # 10000 first chars returned
59
+ # Return only the first 10000 chars of the content:
60
+ parser = Rika::Parser.new('document.pdf', 10000)
61
+ parser.content # 10000 first chars returned
50
62
 
51
- # Return content from URL
52
- parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
53
- parser.content
63
+ # Return content from URL
64
+ parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
65
+ parser.content
54
66
 
55
- # Return the language for the content
56
- parser = parser = Rika::Parser.new('german document.pdf')
57
- parser.language
58
- => "de"
67
+ # Return the language for the content
68
+ parser = Rika::Parser.new('german document.pdf')
69
+ parser.language
70
+ => "de"
59
71
 
60
- # Check whether the langugage identification is certain enough to be trusted
61
- parser.language_is_reasonably_certain?
72
+ # Check whether the language identification is certain enough to be trusted
73
+ parser.language_is_reasonably_certain?
62
74
 
63
75
  ```
76
+
77
+ ## Credits
78
+ The following people have contributed ideas, documentation, or code to Rika:
79
+ * Keith Bennett
80
+ * Richard Nyström
81
+
64
82
  ## Contributing
65
83
 
66
84
  1. Fork it
@@ -18,7 +18,23 @@ module Rika
18
18
  import org.apache.tika.language.LanguageIdentifier
19
19
  import java.io.FileInputStream
20
20
  import java.net.URL
21
-
21
+
22
+ def self.parse_content_and_metadata(file_location, max_content_length = -1)
23
+ parser = Parser.new(file_location, max_content_length)
24
+ [parser.content, parser.metadata]
25
+ end
26
+
27
+ def self.parse_content(file_location, max_content_length = -1)
28
+ parser = Parser.new(file_location, max_content_length)
29
+ parser.content
30
+ end
31
+
32
+ def self.parse_metadata(file_location)
33
+ parser = Parser.new(file_location, 0)
34
+ parser.metadata
35
+ end
36
+
37
+
22
38
  class Parser
23
39
 
24
40
  def initialize(file_location, max_content_length = -1)
@@ -1,3 +1,3 @@
1
1
  module Rika
2
- VERSION = "1.0.0"
2
+ VERSION = "1.1.1"
3
3
  end
@@ -21,6 +21,7 @@ describe Rika::Parser do
21
21
  :AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
22
22
  @server.start
23
23
  end
24
+ @sample_pdf_filespec = file_path("document.pdf")
24
25
  end
25
26
 
26
27
  after(:all) do
@@ -173,4 +174,20 @@ describe Rika::Parser do
173
174
  lang.language_is_reasonably_certain? == true
174
175
  end
175
176
  end
176
- end
177
+
178
+ it "should return valid content using Rika.parse_content" do
179
+ content = Rika.parse_content(@sample_pdf_filespec)
180
+ (content.should be_a(String)) && (content.should_not be_empty)
181
+ end
182
+
183
+ it "should return valid metadata using Rika.parse_metadata" do
184
+ metadata = Rika.parse_metadata(@sample_pdf_filespec)
185
+ (metadata.should be_a(Hash)) && (metadata.should_not be_empty)
186
+ end
187
+
188
+ it "should return valid content and metadata using Rika.parse_content_and_metadata" do
189
+ content, metadata = Rika.parse_content_and_metadata(@sample_pdf_filespec)
190
+ (content.should be_a(String)) && (content.should_not be_empty) && \
191
+ (metadata.should be_a(Hash)) && (metadata.should_not be_empty)
192
+ end
193
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.1
5
5
  prerelease:
6
6
  platform: java
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-23 00:00:00.000000000 Z
12
+ date: 2013-02-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -43,17 +43,17 @@ dependencies:
43
43
  none: false
44
44
  prerelease: false
45
45
  type: :development
46
- description: ! ' A JRuby wrapper for Apache Tika to extract text and metadata from
47
- various file formats. '
46
+ description: " A JRuby wrapper for Apache Tika to extract text and metadata from various\
47
+ \ file formats. "
48
48
  email:
49
49
  - ricny046@gmail.com
50
50
  executables: []
51
51
  extensions: []
52
52
  extra_rdoc_files: []
53
53
  files:
54
- - .gitignore
55
- - .rspec
56
- - .travis.yml
54
+ - ".gitignore"
55
+ - ".rspec"
56
+ - ".travis.yml"
57
57
  - Gemfile
58
58
  - LICENSE.txt
59
59
  - README.md
@@ -123,14 +123,14 @@ require_paths:
123
123
  - lib
124
124
  required_ruby_version: !ruby/object:Gem::Requirement
125
125
  requirements:
126
- - - ! '>='
126
+ - - ">="
127
127
  - !ruby/object:Gem::Version
128
128
  version: !binary |-
129
129
  MA==
130
130
  none: false
131
131
  required_rubygems_version: !ruby/object:Gem::Requirement
132
132
  requirements:
133
- - - ! '>='
133
+ - - ">="
134
134
  - !ruby/object:Gem::Version
135
135
  version: !binary |-
136
136
  MA==