rika 0.9.3-java → 0.9.4-java
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +5 -1
- data/lib/rika.rb +6 -0
- data/lib/rika/version.rb +1 -1
- data/spec/fixtures/unknown.bin +0 -0
- data/spec/rika_spec.rb +29 -5
- metadata +4 -2
data/README.md
CHANGED
@@ -31,6 +31,10 @@ Something like this:
|
|
31
31
|
# Return the content of the document:
|
32
32
|
parser.content
|
33
33
|
|
34
|
+
# Return the media type for the document:
|
35
|
+
parser.media_type
|
36
|
+
=> "application/pdf"
|
37
|
+
|
34
38
|
# Return the metadata field title if it exists:
|
35
39
|
parser.metadata["title"] if parser.metadata_exists?("title")
|
36
40
|
|
@@ -42,7 +46,7 @@ Something like this:
|
|
42
46
|
parser.content # 10000 first chars returned
|
43
47
|
|
44
48
|
# Return content from URL
|
45
|
-
parser = Rika::Parser.new('http://
|
49
|
+
parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
|
46
50
|
parser.content
|
47
51
|
```
|
48
52
|
## Contributing
|
data/lib/rika.rb
CHANGED
@@ -46,6 +46,10 @@ module Rika
|
|
46
46
|
metadata_hash
|
47
47
|
end
|
48
48
|
|
49
|
+
def media_type
|
50
|
+
@media_type
|
51
|
+
end
|
52
|
+
|
49
53
|
def available_metadata
|
50
54
|
@metadata.names.to_a
|
51
55
|
end
|
@@ -59,6 +63,7 @@ module Rika
|
|
59
63
|
def parse_file
|
60
64
|
input_stream = java.io.FileInputStream.new(java.io.File.new(@uri))
|
61
65
|
@metadata.set("filename", File.basename(@uri))
|
66
|
+
@media_type = @tika.detect(java.io.FileInputStream.new(java.io.File.new(@uri)))
|
62
67
|
@content = @tika.parse_to_string(input_stream, @metadata)
|
63
68
|
end
|
64
69
|
|
@@ -66,6 +71,7 @@ module Rika
|
|
66
71
|
raise IOError, "File does not exist or can't be reached." if not Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
|
67
72
|
url = java.net.URL.new(@uri)
|
68
73
|
input_stream = url.open_stream
|
74
|
+
@media_type = @tika.detect(url.open_stream)
|
69
75
|
@metadata.set("url", @uri)
|
70
76
|
@content = @tika.parse_to_string(input_stream, @metadata)
|
71
77
|
end
|
data/lib/rika/version.rb
CHANGED
Binary file
|
data/spec/rika_spec.rb
CHANGED
@@ -11,10 +11,11 @@ describe Rika::Parser do
|
|
11
11
|
@docx_parser = Rika::Parser.new(file_path("document.docx"))
|
12
12
|
@pdf_parser = Rika::Parser.new(file_path("document.pdf"))
|
13
13
|
@image_parser = Rika::Parser.new(file_path("image.jpg"))
|
14
|
+
@unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
|
14
15
|
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
15
16
|
port = 50505
|
16
17
|
@url = "http://#{Socket.gethostname}:#{port}"
|
17
|
-
|
18
|
+
@qoute = "First they ignore you, then they ridicule you, then they fight you, then you win."
|
18
19
|
@t1 = Thread.new do
|
19
20
|
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
20
21
|
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
@@ -41,15 +42,15 @@ describe Rika::Parser do
|
|
41
42
|
|
42
43
|
describe '#content' do
|
43
44
|
it "should return the content in a text file" do
|
44
|
-
@txt_parser.content.strip.should ==
|
45
|
+
@txt_parser.content.strip.should == @qoute
|
45
46
|
end
|
46
47
|
|
47
48
|
it "should return the content in a docx file" do
|
48
|
-
@docx_parser.content.should ==
|
49
|
+
@docx_parser.content.should == @qoute
|
49
50
|
end
|
50
51
|
|
51
52
|
it "should return the content in a pdf file" do
|
52
|
-
@pdf_parser.content.should ==
|
53
|
+
@pdf_parser.content.should == @qoute
|
53
54
|
end
|
54
55
|
|
55
56
|
it "should return no content for an image" do
|
@@ -73,7 +74,11 @@ describe Rika::Parser do
|
|
73
74
|
|
74
75
|
it "should return the content from a file over http" do
|
75
76
|
parser = Rika::Parser.new(@url + "/document.pdf")
|
76
|
-
parser.content.should ==
|
77
|
+
parser.content.should == @qoute
|
78
|
+
end
|
79
|
+
|
80
|
+
it "should return empty string for unknown file" do
|
81
|
+
@unknown_parser.content.should be_empty
|
77
82
|
end
|
78
83
|
end
|
79
84
|
|
@@ -127,4 +132,23 @@ describe Rika::Parser do
|
|
127
132
|
@docx_parser.metadata_exists?("title").should == true
|
128
133
|
end
|
129
134
|
end
|
135
|
+
|
136
|
+
describe '#media_type' do
|
137
|
+
it "should return application/pdf for a pdf file" do
|
138
|
+
@pdf_parser.media_type.should == "application/pdf"
|
139
|
+
end
|
140
|
+
|
141
|
+
it "should return text/plain for a txt file" do
|
142
|
+
@txt_parser.media_type.should == "text/plain"
|
143
|
+
end
|
144
|
+
|
145
|
+
it "should return application/pdf for a pdf over http" do
|
146
|
+
parser = Rika::Parser.new(@url + "/document.pdf")
|
147
|
+
parser.media_type.should == "application/pdf"
|
148
|
+
end
|
149
|
+
|
150
|
+
it "should return application/octet-stream for unknown file" do
|
151
|
+
@unknown_parser.media_type.should == "application/octet-stream"
|
152
|
+
end
|
153
|
+
end
|
130
154
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.3
|
4
|
+
version: 0.9.4
|
5
5
|
prerelease:
|
6
6
|
platform: java
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -54,6 +54,7 @@ files:
|
|
54
54
|
- spec/fixtures/over_100k_file.txt
|
55
55
|
- spec/fixtures/text_file.txt
|
56
56
|
- spec/fixtures/text_file_without_extension
|
57
|
+
- spec/fixtures/unknown.bin
|
57
58
|
- spec/rika_spec.rb
|
58
59
|
- spec/spec_helper.rb
|
59
60
|
- target/dependency/apache-mime4j-core-0.7.2.jar
|
@@ -124,5 +125,6 @@ test_files:
|
|
124
125
|
- spec/fixtures/over_100k_file.txt
|
125
126
|
- spec/fixtures/text_file.txt
|
126
127
|
- spec/fixtures/text_file_without_extension
|
128
|
+
- spec/fixtures/unknown.bin
|
127
129
|
- spec/rika_spec.rb
|
128
130
|
- spec/spec_helper.rb
|