rika 0.9.7-java → 0.9.8-java
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rika.rb +30 -22
- data/lib/rika/version.rb +1 -1
- data/spec/rika_spec.rb +8 -8
- metadata +2 -2
data/lib/rika.rb
CHANGED
@@ -11,7 +11,7 @@ Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |ja
|
|
11
11
|
require jar
|
12
12
|
end
|
13
13
|
|
14
|
-
# Heavily based on the Apache Tika API: http://tika.apache.org/1.
|
14
|
+
# Heavily based on the Apache Tika API: http://tika.apache.org/1.3/api/org/apache/tika/Tika.html
|
15
15
|
module Rika
|
16
16
|
import org.apache.tika.metadata.Metadata
|
17
17
|
import org.apache.tika.Tika
|
@@ -24,14 +24,9 @@ module Rika
|
|
24
24
|
@uri = file_location
|
25
25
|
@tika = Tika.new
|
26
26
|
@tika.set_max_string_length(max_content_length)
|
27
|
-
@
|
28
|
-
|
29
|
-
@
|
30
|
-
is_http = URI(@uri).scheme == "http" && Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess) if !@is_file
|
31
|
-
|
32
|
-
if !@is_file && !is_http
|
33
|
-
raise IOError, "File does not exist or can't be reached."
|
34
|
-
end
|
27
|
+
@metadata_java = Metadata.new
|
28
|
+
@metadata_ruby = nil
|
29
|
+
@input_type = get_input_type
|
35
30
|
end
|
36
31
|
|
37
32
|
def content
|
@@ -40,14 +35,15 @@ module Rika
|
|
40
35
|
end
|
41
36
|
|
42
37
|
def metadata
|
43
|
-
|
44
|
-
|
38
|
+
unless @metadata_ruby
|
39
|
+
self.parse
|
40
|
+
@metadata_ruby = {}
|
45
41
|
|
46
|
-
|
47
|
-
|
42
|
+
@metadata_java.names.each do |name|
|
43
|
+
@metadata_ruby[name] = @metadata_java.get(name)
|
44
|
+
end
|
48
45
|
end
|
49
|
-
|
50
|
-
metadata_hash
|
46
|
+
@metadata_ruby
|
51
47
|
end
|
52
48
|
|
53
49
|
def media_type
|
@@ -55,25 +51,37 @@ module Rika
|
|
55
51
|
end
|
56
52
|
|
57
53
|
def available_metadata
|
58
|
-
|
59
|
-
@metadata.names.to_a
|
54
|
+
metadata.keys
|
60
55
|
end
|
61
56
|
|
62
57
|
def metadata_exists?(name)
|
63
|
-
|
64
|
-
|
58
|
+
metadata[name] != nil
|
59
|
+
end
|
60
|
+
|
61
|
+
def file?
|
62
|
+
@input_type == :file
|
65
63
|
end
|
66
64
|
|
67
65
|
protected
|
68
66
|
|
69
67
|
def parse
|
70
|
-
@content ||= @tika.parse_to_string(input_stream, @
|
68
|
+
@content ||= @tika.parse_to_string(input_stream, @metadata_java).to_s.strip
|
69
|
+
end
|
70
|
+
|
71
|
+
def get_input_type
|
72
|
+
if File.exists?(@uri) && File.directory?(@uri) == false
|
73
|
+
:file
|
74
|
+
elsif URI(@uri).scheme == "http" && Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
|
75
|
+
:http
|
76
|
+
else
|
77
|
+
raise IOError, "Input (#{@uri}) is neither file nor http."
|
78
|
+
end
|
71
79
|
end
|
72
80
|
|
73
81
|
def input_stream
|
74
|
-
if
|
82
|
+
if file?
|
75
83
|
FileInputStream.new(java.io.File.new(@uri))
|
76
|
-
else
|
84
|
+
else # :http
|
77
85
|
URL.new(@uri).open_stream
|
78
86
|
end
|
79
87
|
end
|
data/lib/rika/version.rb
CHANGED
data/spec/rika_spec.rb
CHANGED
@@ -15,7 +15,7 @@ describe Rika::Parser do
|
|
15
15
|
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
16
16
|
port = 50505
|
17
17
|
@url = "http://#{Socket.gethostname}:#{port}"
|
18
|
-
@
|
18
|
+
@quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
|
19
19
|
@t1 = Thread.new do
|
20
20
|
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
21
21
|
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
@@ -28,11 +28,11 @@ describe Rika::Parser do
|
|
28
28
|
end
|
29
29
|
|
30
30
|
it "should raise error if file does not exists" do
|
31
|
-
lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError
|
31
|
+
lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError)
|
32
32
|
end
|
33
33
|
|
34
34
|
it "should raise error if URL does not exists" do
|
35
|
-
lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError
|
35
|
+
lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError)
|
36
36
|
end
|
37
37
|
|
38
38
|
it "should detect file type without a file extension" do
|
@@ -41,20 +41,20 @@ describe Rika::Parser do
|
|
41
41
|
end
|
42
42
|
|
43
43
|
it "should not be possible to trick the parser to read a folder with an extension" do
|
44
|
-
lambda { Rika::Parser.new(file_path("folder.js")).content }.should raise_error(IOError
|
44
|
+
lambda { Rika::Parser.new(file_path("folder.js")).content }.should raise_error(IOError)
|
45
45
|
end
|
46
46
|
|
47
47
|
describe '#content' do
|
48
48
|
it "should return the content in a text file" do
|
49
|
-
@txt_parser.content.strip.should == @
|
49
|
+
@txt_parser.content.strip.should == @quote
|
50
50
|
end
|
51
51
|
|
52
52
|
it "should return the content in a docx file" do
|
53
|
-
@docx_parser.content.should == @
|
53
|
+
@docx_parser.content.should == @quote
|
54
54
|
end
|
55
55
|
|
56
56
|
it "should return the content in a pdf file" do
|
57
|
-
@pdf_parser.content.should == @
|
57
|
+
@pdf_parser.content.should == @quote
|
58
58
|
end
|
59
59
|
|
60
60
|
it "should return no content for an image" do
|
@@ -78,7 +78,7 @@ describe Rika::Parser do
|
|
78
78
|
|
79
79
|
it "should return the content from a file over http" do
|
80
80
|
parser = Rika::Parser.new(@url + "/document.pdf")
|
81
|
-
parser.content.should == @
|
81
|
+
parser.content.should == @quote
|
82
82
|
end
|
83
83
|
|
84
84
|
it "should return empty string for unknown file" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.7
|
4
|
+
version: 0.9.8
|
5
5
|
prerelease:
|
6
6
|
platform: java
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-01-
|
12
|
+
date: 2013-01-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|