rika 0.9.2-java → 0.9.3-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -39,7 +39,11 @@ Something like this:
39
39
 
40
40
  # Return only the first 10000 chars of the content:
41
41
  parser = Rika::Parser.new('document.pdf', 10000)
42
- parser.content # 10000 first chars returned
42
+ parser.content # 10000 first chars returned
43
+
44
+ # Return content from URL
45
+ parser = Rika::Parser.new('http://www.exampleurl.com/example.pdf')
46
+ parser.content
43
47
  ```
44
48
  ## Contributing
45
49
 
data/lib/rika.rb CHANGED
@@ -1,24 +1,34 @@
1
1
  raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
2
2
 
3
3
  require "rika/version"
4
+ require 'uri'
5
+ require 'net/http'
4
6
  require 'java'
5
7
 
6
8
  Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |jar|
7
9
  require jar
8
10
  end
9
11
 
12
+ # Heavily based on the Apache Tika API: http://tika.apache.org/1.2/api/org/apache/tika/Tika.html
10
13
  module Rika
11
14
  import org.apache.tika.metadata.Metadata
12
15
  import org.apache.tika.Tika
16
+
13
17
  class Parser
14
18
 
15
- def initialize(filename, max_content_length = -1)
16
- if File.exists?(filename)
17
- @filename = filename
18
- @max_content_length = max_content_length
19
- self.perform
19
+ def initialize(uri, max_content_length = -1)
20
+ p = URI::Parser.new
21
+ @uri = uri
22
+ @tika = Tika.new
23
+ @tika.set_max_string_length(max_content_length)
24
+ @metadata = Metadata.new
25
+
26
+ if File.exists?(@uri)
27
+ self.parse_file
28
+ elsif p.parse(@uri).scheme == 'http' || p.parse(@uri).scheme == 'https'
29
+ self.parse_url
20
30
  else
21
- raise IOError, "File does not exist"
31
+ raise IOError, "File does not exist or can't be reached."
22
32
  end
23
33
  end
24
34
 
@@ -41,26 +51,23 @@ module Rika
41
51
  end
42
52
 
43
53
  def metadata_exists?(name)
44
- if @metadata.get(name) == nil
45
- false
46
- else
47
- true
48
- end
54
+ @metadata.get(name) != nil
49
55
  end
50
56
 
51
57
  protected
52
58
 
53
- def perform
54
- input_stream = nil
55
- begin
56
- input_stream = java.io.FileInputStream.new(java.io.File.new(@filename))
57
- @metadata = Metadata.new
58
- @metadata.set("filename", File.basename(@filename))
59
- @tika = Tika.new
60
- @content = @tika.parse_to_string(input_stream, @metadata, @max_content_length)
61
- ensure
62
- input_stream.close
63
- end
59
+ def parse_file
60
+ input_stream = java.io.FileInputStream.new(java.io.File.new(@uri))
61
+ @metadata.set("filename", File.basename(@uri))
62
+ @content = @tika.parse_to_string(input_stream, @metadata)
63
+ end
64
+
65
+ def parse_url
66
+ raise IOError, "File does not exist or can't be reached." if not Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
67
+ url = java.net.URL.new(@uri)
68
+ input_stream = url.open_stream
69
+ @metadata.set("url", @uri)
70
+ @content = @tika.parse_to_string(input_stream, @metadata)
64
71
  end
65
72
  end
66
73
  end
data/lib/rika/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rika
2
- VERSION = "0.9.2"
2
+ VERSION = "0.9.3"
3
3
  end
data/spec/rika_spec.rb CHANGED
@@ -1,17 +1,37 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  require 'spec_helper'
4
+ require 'webrick'
4
5
 
6
+ include WEBrick
7
+
5
8
  describe Rika::Parser do
6
9
  before(:all) do
7
10
  @txt_parser = Rika::Parser.new(file_path("text_file.txt"))
8
11
  @docx_parser = Rika::Parser.new(file_path("document.docx"))
9
12
  @pdf_parser = Rika::Parser.new(file_path("document.pdf"))
10
13
  @image_parser = Rika::Parser.new(file_path("image.jpg"))
14
+ @dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
15
+ port = 50505
16
+ @url = "http://#{Socket.gethostname}:#{port}"
17
+
18
+ @t1 = Thread.new do
19
+ @server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
20
+ :AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
21
+ @server.start
22
+ end
23
+ end
24
+
25
+ after(:all) do
26
+ @t1.exit
11
27
  end
12
28
 
13
29
  it "should raise error if file does not exists" do
14
- lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError, "File does not exist")
30
+ lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError, "File does not exist or can't be reached.")
31
+ end
32
+
33
+ it "should raise error if URL does not exists" do
34
+ lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError, "File does not exist or can't be reached.")
15
35
  end
16
36
 
17
37
  it "should detect file type without a file extension" do
@@ -41,10 +61,20 @@ describe Rika::Parser do
41
61
  parser.content.should == "First"
42
62
  end
43
63
 
64
+ it "should only return max content length for file over http" do
65
+ parser = Rika::Parser.new(@url + "/document.pdf", 6)
66
+ parser.content.should == "First"
67
+ end
68
+
44
69
  it "should be possible to read files over 100k by default" do
45
70
  parser = Rika::Parser.new(file_path("over_100k_file.txt"))
46
71
  parser.content.length.should == 101_761
47
72
  end
73
+
74
+ it "should return the content from a file over http" do
75
+ parser = Rika::Parser.new(@url + "/document.pdf")
76
+ parser.content.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
77
+ end
48
78
  end
49
79
 
50
80
  # We just test a few of the metadata fields for some common file formats
@@ -67,6 +97,11 @@ describe Rika::Parser do
67
97
  @pdf_parser.metadata["title"].should == "A simple title"
68
98
  end
69
99
 
100
+ it "should return metadata from a file over http" do
101
+ parser = Rika::Parser.new(@url + "/document.pdf")
102
+ parser.metadata["title"].should == "A simple title"
103
+ end
104
+
70
105
  it "should return metadata from an image" do
71
106
  @image_parser.metadata["Image Height"].should == "72 pixels"
72
107
  @image_parser.metadata["Image Width"].should == "72 pixels"
data/spec/spec_helper.rb CHANGED
@@ -3,6 +3,7 @@ require "rika"
3
3
  def file_path( *paths )
4
4
  File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
5
5
  end
6
+
6
7
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
8
  RSpec.configure do |config|
8
9
  config.treat_symbols_as_metadata_keys_with_true_values = true
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
+ version: 0.9.3
4
5
  prerelease:
5
- version: 0.9.2
6
6
  platform: java
7
7
  authors:
8
8
  - Richard Nyström
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-17 00:00:00.000000000 Z
12
+ date: 2012-09-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec