rika 0.9.2-java → 0.9.3-java
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +5 -1
- data/lib/rika.rb +29 -22
- data/lib/rika/version.rb +1 -1
- data/spec/rika_spec.rb +36 -1
- data/spec/spec_helper.rb +1 -0
- metadata +2 -2
data/README.md
CHANGED
@@ -39,7 +39,11 @@ Something like this:
|
|
39
39
|
|
40
40
|
# Return only the first 10000 chars of the content:
|
41
41
|
parser = Rika::Parser.new('document.pdf', 10000)
|
42
|
-
parser.content # 10000 first chars returned
|
42
|
+
parser.content # 10000 first chars returned
|
43
|
+
|
44
|
+
# Return content from URL
|
45
|
+
parser = Rika::Parser.new('http://www.exampleurl.com/example.pdf')
|
46
|
+
parser.content
|
43
47
|
```
|
44
48
|
## Contributing
|
45
49
|
|
data/lib/rika.rb
CHANGED
@@ -1,24 +1,34 @@
|
|
1
1
|
raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
|
2
2
|
|
3
3
|
require "rika/version"
|
4
|
+
require 'uri'
|
5
|
+
require 'net/http'
|
4
6
|
require 'java'
|
5
7
|
|
6
8
|
Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |jar|
|
7
9
|
require jar
|
8
10
|
end
|
9
11
|
|
12
|
+
# Heavily based on the Apache Tika API: http://tika.apache.org/1.2/api/org/apache/tika/Tika.html
|
10
13
|
module Rika
|
11
14
|
import org.apache.tika.metadata.Metadata
|
12
15
|
import org.apache.tika.Tika
|
16
|
+
|
13
17
|
class Parser
|
14
18
|
|
15
|
-
def initialize(
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
19
|
+
def initialize(uri, max_content_length = -1)
|
20
|
+
p = URI::Parser.new
|
21
|
+
@uri = uri
|
22
|
+
@tika = Tika.new
|
23
|
+
@tika.set_max_string_length(max_content_length)
|
24
|
+
@metadata = Metadata.new
|
25
|
+
|
26
|
+
if File.exists?(@uri)
|
27
|
+
self.parse_file
|
28
|
+
elsif p.parse(@uri).scheme == 'http' || p.parse(@uri).scheme == 'https'
|
29
|
+
self.parse_url
|
20
30
|
else
|
21
|
-
raise IOError, "File does not exist"
|
31
|
+
raise IOError, "File does not exist or can't be reached."
|
22
32
|
end
|
23
33
|
end
|
24
34
|
|
@@ -41,26 +51,23 @@ module Rika
|
|
41
51
|
end
|
42
52
|
|
43
53
|
def metadata_exists?(name)
|
44
|
-
|
45
|
-
false
|
46
|
-
else
|
47
|
-
true
|
48
|
-
end
|
54
|
+
@metadata.get(name) != nil
|
49
55
|
end
|
50
56
|
|
51
57
|
protected
|
52
58
|
|
53
|
-
def
|
54
|
-
input_stream =
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
59
|
+
def parse_file
|
60
|
+
input_stream = java.io.FileInputStream.new(java.io.File.new(@uri))
|
61
|
+
@metadata.set("filename", File.basename(@uri))
|
62
|
+
@content = @tika.parse_to_string(input_stream, @metadata)
|
63
|
+
end
|
64
|
+
|
65
|
+
def parse_url
|
66
|
+
raise IOError, "File does not exist or can't be reached." if not Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
|
67
|
+
url = java.net.URL.new(@uri)
|
68
|
+
input_stream = url.open_stream
|
69
|
+
@metadata.set("url", @uri)
|
70
|
+
@content = @tika.parse_to_string(input_stream, @metadata)
|
64
71
|
end
|
65
72
|
end
|
66
73
|
end
|
data/lib/rika/version.rb
CHANGED
data/spec/rika_spec.rb
CHANGED
@@ -1,17 +1,37 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
require 'spec_helper'
|
4
|
+
require 'webrick'
|
4
5
|
|
6
|
+
include WEBrick
|
7
|
+
|
5
8
|
describe Rika::Parser do
|
6
9
|
before(:all) do
|
7
10
|
@txt_parser = Rika::Parser.new(file_path("text_file.txt"))
|
8
11
|
@docx_parser = Rika::Parser.new(file_path("document.docx"))
|
9
12
|
@pdf_parser = Rika::Parser.new(file_path("document.pdf"))
|
10
13
|
@image_parser = Rika::Parser.new(file_path("image.jpg"))
|
14
|
+
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
15
|
+
port = 50505
|
16
|
+
@url = "http://#{Socket.gethostname}:#{port}"
|
17
|
+
|
18
|
+
@t1 = Thread.new do
|
19
|
+
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
20
|
+
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
21
|
+
@server.start
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
after(:all) do
|
26
|
+
@t1.exit
|
11
27
|
end
|
12
28
|
|
13
29
|
it "should raise error if file does not exists" do
|
14
|
-
lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError, "File does not exist")
|
30
|
+
lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError, "File does not exist or can't be reached.")
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should raise error if URL does not exists" do
|
34
|
+
lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError, "File does not exist or can't be reached.")
|
15
35
|
end
|
16
36
|
|
17
37
|
it "should detect file type without a file extension" do
|
@@ -41,10 +61,20 @@ describe Rika::Parser do
|
|
41
61
|
parser.content.should == "First"
|
42
62
|
end
|
43
63
|
|
64
|
+
it "should only return max content length for file over http" do
|
65
|
+
parser = Rika::Parser.new(@url + "/document.pdf", 6)
|
66
|
+
parser.content.should == "First"
|
67
|
+
end
|
68
|
+
|
44
69
|
it "should be possible to read files over 100k by default" do
|
45
70
|
parser = Rika::Parser.new(file_path("over_100k_file.txt"))
|
46
71
|
parser.content.length.should == 101_761
|
47
72
|
end
|
73
|
+
|
74
|
+
it "should return the content from a file over http" do
|
75
|
+
parser = Rika::Parser.new(@url + "/document.pdf")
|
76
|
+
parser.content.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
|
77
|
+
end
|
48
78
|
end
|
49
79
|
|
50
80
|
# We just test a few of the metadata fields for some common file formats
|
@@ -67,6 +97,11 @@ describe Rika::Parser do
|
|
67
97
|
@pdf_parser.metadata["title"].should == "A simple title"
|
68
98
|
end
|
69
99
|
|
100
|
+
it "should return metadata from a file over http" do
|
101
|
+
parser = Rika::Parser.new(@url + "/document.pdf")
|
102
|
+
parser.metadata["title"].should == "A simple title"
|
103
|
+
end
|
104
|
+
|
70
105
|
it "should return metadata from an image" do
|
71
106
|
@image_parser.metadata["Image Height"].should == "72 pixels"
|
72
107
|
@image_parser.metadata["Image Width"].should == "72 pixels"
|
data/spec/spec_helper.rb
CHANGED
@@ -3,6 +3,7 @@ require "rika"
|
|
3
3
|
def file_path( *paths )
|
4
4
|
File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
|
5
5
|
end
|
6
|
+
|
6
7
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
8
|
RSpec.configure do |config|
|
8
9
|
config.treat_symbols_as_metadata_keys_with_true_values = true
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.9.3
|
4
5
|
prerelease:
|
5
|
-
version: 0.9.2
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Richard Nyström
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|