rika 0.9.3-java → 0.9.4-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -31,6 +31,10 @@ Something like this:
31
31
  # Return the content of the document:
32
32
  parser.content
33
33
 
34
+ # Return the media type for the document:
35
+ parser.media_type
36
+ => "application/pdf"
37
+
34
38
  # Return the metadata field title if it exists:
35
39
  parser.metadata["title"] if parser.metadata_exists?("title")
36
40
 
@@ -42,7 +46,7 @@ Something like this:
42
46
  parser.content # 10000 first chars returned
43
47
 
44
48
  # Return content from URL
45
- parser = Rika::Parser.new('http://www.exampleurl.com/example.pdf')
49
+ parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
46
50
  parser.content
47
51
  ```
48
52
  ## Contributing
@@ -46,6 +46,10 @@ module Rika
46
46
  metadata_hash
47
47
  end
48
48
 
49
+ def media_type
50
+ @media_type
51
+ end
52
+
49
53
  def available_metadata
50
54
  @metadata.names.to_a
51
55
  end
@@ -59,6 +63,7 @@ module Rika
59
63
  def parse_file
60
64
  input_stream = java.io.FileInputStream.new(java.io.File.new(@uri))
61
65
  @metadata.set("filename", File.basename(@uri))
66
+ @media_type = @tika.detect(java.io.FileInputStream.new(java.io.File.new(@uri)))
62
67
  @content = @tika.parse_to_string(input_stream, @metadata)
63
68
  end
64
69
 
@@ -66,6 +71,7 @@ module Rika
66
71
  raise IOError, "File does not exist or can't be reached." if not Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
67
72
  url = java.net.URL.new(@uri)
68
73
  input_stream = url.open_stream
74
+ @media_type = @tika.detect(url.open_stream)
69
75
  @metadata.set("url", @uri)
70
76
  @content = @tika.parse_to_string(input_stream, @metadata)
71
77
  end
@@ -1,3 +1,3 @@
1
1
  module Rika
2
- VERSION = "0.9.3"
2
+ VERSION = "0.9.4"
3
3
  end
Binary file
@@ -11,10 +11,11 @@ describe Rika::Parser do
11
11
  @docx_parser = Rika::Parser.new(file_path("document.docx"))
12
12
  @pdf_parser = Rika::Parser.new(file_path("document.pdf"))
13
13
  @image_parser = Rika::Parser.new(file_path("image.jpg"))
14
+ @unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
14
15
  @dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
15
16
  port = 50505
16
17
  @url = "http://#{Socket.gethostname}:#{port}"
17
-
18
+ @qoute = "First they ignore you, then they ridicule you, then they fight you, then you win."
18
19
  @t1 = Thread.new do
19
20
  @server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
20
21
  :AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
@@ -41,15 +42,15 @@ describe Rika::Parser do
41
42
 
42
43
  describe '#content' do
43
44
  it "should return the content in a text file" do
44
- @txt_parser.content.strip.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
45
+ @txt_parser.content.strip.should == @qoute
45
46
  end
46
47
 
47
48
  it "should return the content in a docx file" do
48
- @docx_parser.content.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
49
+ @docx_parser.content.should == @qoute
49
50
  end
50
51
 
51
52
  it "should return the content in a pdf file" do
52
- @pdf_parser.content.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
53
+ @pdf_parser.content.should == @qoute
53
54
  end
54
55
 
55
56
  it "should return no content for an image" do
@@ -73,7 +74,11 @@ describe Rika::Parser do
73
74
 
74
75
  it "should return the content from a file over http" do
75
76
  parser = Rika::Parser.new(@url + "/document.pdf")
76
- parser.content.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
77
+ parser.content.should == @qoute
78
+ end
79
+
80
+ it "should return empty string for unknown file" do
81
+ @unknown_parser.content.should be_empty
77
82
  end
78
83
  end
79
84
 
@@ -127,4 +132,23 @@ describe Rika::Parser do
127
132
  @docx_parser.metadata_exists?("title").should == true
128
133
  end
129
134
  end
135
+
136
+ describe '#media_type' do
137
+ it "should return application/pdf for a pdf file" do
138
+ @pdf_parser.media_type.should == "application/pdf"
139
+ end
140
+
141
+ it "should return text/plain for a txt file" do
142
+ @txt_parser.media_type.should == "text/plain"
143
+ end
144
+
145
+ it "should return application/pdf for a pdf over http" do
146
+ parser = Rika::Parser.new(@url + "/document.pdf")
147
+ parser.media_type.should == "application/pdf"
148
+ end
149
+
150
+ it "should return application/octet-stream for unknown file" do
151
+ @unknown_parser.media_type.should == "application/octet-stream"
152
+ end
153
+ end
130
154
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.3
4
+ version: 0.9.4
5
5
  prerelease:
6
6
  platform: java
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-23 00:00:00.000000000 Z
12
+ date: 2012-09-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -54,6 +54,7 @@ files:
54
54
  - spec/fixtures/over_100k_file.txt
55
55
  - spec/fixtures/text_file.txt
56
56
  - spec/fixtures/text_file_without_extension
57
+ - spec/fixtures/unknown.bin
57
58
  - spec/rika_spec.rb
58
59
  - spec/spec_helper.rb
59
60
  - target/dependency/apache-mime4j-core-0.7.2.jar
@@ -124,5 +125,6 @@ test_files:
124
125
  - spec/fixtures/over_100k_file.txt
125
126
  - spec/fixtures/text_file.txt
126
127
  - spec/fixtures/text_file_without_extension
128
+ - spec/fixtures/unknown.bin
127
129
  - spec/rika_spec.rb
128
130
  - spec/spec_helper.rb