rika 0.9.4-java → 0.9.5-java

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/lib/rika.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
2
4
 
3
5
  require "rika/version"
@@ -13,30 +15,32 @@ end
13
15
  module Rika
14
16
  import org.apache.tika.metadata.Metadata
15
17
  import org.apache.tika.Tika
18
+ import java.io.FileInputStream
19
+ import java.net.URL
16
20
 
17
21
  class Parser
18
22
 
19
- def initialize(uri, max_content_length = -1)
20
- p = URI::Parser.new
21
- @uri = uri
23
+ def initialize(file_location, max_content_length = -1)
24
+ @uri = file_location
22
25
  @tika = Tika.new
23
26
  @tika.set_max_string_length(max_content_length)
24
27
  @metadata = Metadata.new
28
+
29
+ @is_file = File.exists?(@uri)
30
+ is_http = URI(@uri).scheme == "http" && Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess) if !@is_file
25
31
 
26
- if File.exists?(@uri)
27
- self.parse_file
28
- elsif p.parse(@uri).scheme == 'http' || p.parse(@uri).scheme == 'https'
29
- self.parse_url
30
- else
32
+ if !@is_file && !is_http
31
33
  raise IOError, "File does not exist or can't be reached."
32
34
  end
33
35
  end
34
36
 
35
37
  def content
36
- @content.to_s.strip
38
+ self.parse
39
+ @content
37
40
  end
38
41
 
39
42
  def metadata
43
+ self.parse
40
44
  metadata_hash = {}
41
45
 
42
46
  @metadata.names.each do |name|
@@ -47,33 +51,31 @@ module Rika
47
51
  end
48
52
 
49
53
  def media_type
50
- @media_type
54
+ @media_type ||= @tika.detect(input_stream)
51
55
  end
52
56
 
53
57
  def available_metadata
58
+ self.parse
54
59
  @metadata.names.to_a
55
60
  end
56
61
 
57
62
  def metadata_exists?(name)
63
+ self.parse
58
64
  @metadata.get(name) != nil
59
65
  end
60
66
 
61
67
  protected
62
68
 
63
- def parse_file
64
- input_stream = java.io.FileInputStream.new(java.io.File.new(@uri))
65
- @metadata.set("filename", File.basename(@uri))
66
- @media_type = @tika.detect(java.io.FileInputStream.new(java.io.File.new(@uri)))
67
- @content = @tika.parse_to_string(input_stream, @metadata)
69
+ def parse
70
+ @content ||= @tika.parse_to_string(input_stream, @metadata).to_s.strip
68
71
  end
69
72
 
70
- def parse_url
71
- raise IOError, "File does not exist or can't be reached." if not Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
72
- url = java.net.URL.new(@uri)
73
- input_stream = url.open_stream
74
- @media_type = @tika.detect(url.open_stream)
75
- @metadata.set("url", @uri)
76
- @content = @tika.parse_to_string(input_stream, @metadata)
73
+ def input_stream
74
+ if @is_file
75
+ FileInputStream.new(java.io.File.new(@uri))
76
+ else
77
+ URL.new(@uri).open_stream
78
+ end
77
79
  end
78
80
  end
79
81
  end
data/lib/rika/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rika
2
- VERSION = "0.9.4"
2
+ VERSION = "0.9.5"
3
3
  end
data/spec/rika_spec.rb CHANGED
@@ -90,10 +90,6 @@ describe Rika::Parser do
90
90
  @txt_parser.metadata["nonsense"].should be_nil
91
91
  end
92
92
 
93
- it "should return metadata from a text file" do
94
- @txt_parser.metadata["filename"].should == "text_file.txt"
95
- end
96
-
97
93
  it "should return metadata from a docx file" do
98
94
  @docx_parser.metadata["Page-Count"].should == "1"
99
95
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.4
4
+ version: 0.9.5
5
5
  prerelease:
6
6
  platform: java
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-25 00:00:00.000000000 Z
12
+ date: 2012-09-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec