rtika 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +5 -0
- data/VERSION +1 -1
- data/lib/rtika.rb +42 -21
- data/lib/{tika-app-0.7.jar → tika-app-0.9-SNAPSHOT.jar} +0 -0
- data/lib/tika-bundle-0.9-SNAPSHOT.jar +0 -0
- data/lib/{tika-core-0.7.jar → tika-core-0.9-SNAPSHOT.jar} +0 -0
- data/lib/tika-parsers-0.9-SNAPSHOT.jar +0 -0
- data/rtika.gemspec +6 -6
- metadata +7 -7
- data/lib/tika-bundle-0.7.jar +0 -0
- data/lib/tika-parsers-0.7.jar +0 -0
data/README.rdoc
CHANGED
@@ -20,6 +20,11 @@ Make sure you're on JRuby first.
|
|
20
20
|
puts result.content # returns <body> contents
|
21
21
|
puts result.title # returns <title> contents
|
22
22
|
|
23
|
+
Options
|
24
|
+
|
25
|
+
:remove_boilerplate => true
|
26
|
+
# uses the Boilerpipe library that ships with Tika to remove headers & footers
|
27
|
+
|
23
28
|
== Note on Patches/Pull Requests
|
24
29
|
|
25
30
|
* Fork the project.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.3.0
|
data/lib/rtika.rb
CHANGED
@@ -7,7 +7,9 @@ Dir[File.join(File.dirname(__FILE__), "*.jar")].each do |jar|
|
|
7
7
|
end
|
8
8
|
|
9
9
|
module RTika
|
10
|
+
import org.apache.tika.parser.html.BoilerpipeContentHandler
|
10
11
|
import org.apache.tika.sax.BodyContentHandler
|
12
|
+
import org.apache.tika.sax.WriteOutContentHandler
|
11
13
|
import org.apache.tika.parser.AutoDetectParser
|
12
14
|
import org.apache.tika.metadata.Metadata
|
13
15
|
|
@@ -43,10 +45,32 @@ module RTika
|
|
43
45
|
new(*args).parse
|
44
46
|
end
|
45
47
|
|
48
|
+
def remove_boilerplate?
|
49
|
+
@options[:remove_boilerplate] && @options[:remove_boilerplate] == true
|
50
|
+
end
|
51
|
+
|
52
|
+
def initialize(*args)
|
53
|
+
@options = args.last
|
54
|
+
|
55
|
+
if remove_boilerplate?
|
56
|
+
@writeout_content = RTika::WriteOutContentHandler.new(-1)
|
57
|
+
@content = RTika::BoilerpipeContentHandler.new(@writeout_content)
|
58
|
+
else
|
59
|
+
@content = RTika::BodyContentHandler.new(-1)
|
60
|
+
end
|
61
|
+
|
62
|
+
@metadata = RTika::Metadata.new
|
63
|
+
end
|
64
|
+
|
46
65
|
def parse
|
47
66
|
@parser = RTika::AutoDetectParser.new
|
48
|
-
content, metadata = process
|
49
|
-
|
67
|
+
@content, @metadata = process
|
68
|
+
|
69
|
+
if remove_boilerplate?
|
70
|
+
RTika::ParsedResult.new(@writeout_content, @metadata)
|
71
|
+
else
|
72
|
+
RTika::ParsedResult.new(@content, @metadata)
|
73
|
+
end
|
50
74
|
end
|
51
75
|
|
52
76
|
def process
|
@@ -55,56 +79,53 @@ module RTika
|
|
55
79
|
end
|
56
80
|
|
57
81
|
class StringParser < GenericParser
|
58
|
-
def initialize(string)
|
82
|
+
def initialize(string, opts={})
|
83
|
+
super(opts)
|
59
84
|
@input_string = string
|
60
85
|
end
|
61
86
|
|
62
87
|
def process
|
63
88
|
input_stream = java.io.ByteArrayInputStream.new(@input_string.to_java.get_bytes)
|
64
|
-
content = RTika::BodyContentHandler.new(-1)
|
65
|
-
metadata = RTika::Metadata.new
|
66
89
|
|
67
|
-
@parser.parse(input_stream, content, metadata)
|
90
|
+
@parser.parse(input_stream, @content, @metadata)
|
68
91
|
input_stream.close
|
69
92
|
|
70
|
-
return [content, metadata]
|
93
|
+
return [@content, @metadata]
|
71
94
|
end
|
72
95
|
end
|
73
96
|
|
74
97
|
class FileParser < GenericParser
|
75
|
-
def initialize(filename)
|
98
|
+
def initialize(filename, opts={})
|
99
|
+
super(opts)
|
76
100
|
@filename = filename
|
77
101
|
end
|
78
102
|
|
79
103
|
def process
|
80
104
|
input_stream = java.io.FileInputStream.new(java.io.File.new(@filename))
|
81
|
-
|
82
|
-
metadata = RTika::Metadata.new
|
83
|
-
metadata.set("filename", File.basename(@filename))
|
105
|
+
@metadata.set("filename", File.basename(@filename))
|
84
106
|
|
85
|
-
@parser.parse(input_stream, content, metadata)
|
107
|
+
@parser.parse(input_stream, @content, @metadata)
|
86
108
|
input_stream.close
|
87
109
|
|
88
|
-
return [content, metadata]
|
110
|
+
return [@content, @metadata]
|
89
111
|
end
|
90
112
|
end
|
91
113
|
|
92
114
|
class UrlParser < GenericParser
|
93
|
-
def initialize(url, content)
|
115
|
+
def initialize(url, content, opts={})
|
116
|
+
super(opts)
|
94
117
|
@url = url
|
95
|
-
@
|
118
|
+
@url_content = content
|
96
119
|
end
|
97
120
|
|
98
121
|
def process
|
99
|
-
input_stream = java.io.ByteArrayInputStream.new(@
|
100
|
-
|
101
|
-
metadata = RTika::Metadata.new
|
102
|
-
metadata.set("filename", File.basename(@url))
|
122
|
+
input_stream = java.io.ByteArrayInputStream.new(@url_content.to_java.get_bytes)
|
123
|
+
@metadata.set("filename", File.basename(@url))
|
103
124
|
|
104
|
-
@parser.parse(input_stream, content, metadata)
|
125
|
+
@parser.parse(input_stream, @content, @metadata)
|
105
126
|
input_stream.close
|
106
127
|
|
107
|
-
return [content, metadata]
|
128
|
+
return [@content, @metadata]
|
108
129
|
end
|
109
130
|
end
|
110
131
|
end
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/rtika.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{rtika}
|
8
|
-
s.version = "0.2.0"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Pradeep Elankumaran"]
|
12
|
-
s.date = %q{2010-11-
|
12
|
+
s.date = %q{2010-11-09}
|
13
13
|
s.description = %q{rTika is a JRuby wrapper around the Apache Tika content extraction library}
|
14
14
|
s.email = %q{pradeepe@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -24,10 +24,10 @@ Gem::Specification.new do |s|
|
|
24
24
|
"Rakefile",
|
25
25
|
"VERSION",
|
26
26
|
"lib/rtika.rb",
|
27
|
-
"lib/tika-app-0.7.jar",
|
28
|
-
"lib/tika-bundle-0.7.jar",
|
29
|
-
"lib/tika-core-0.7.jar",
|
30
|
-
"lib/tika-parsers-0.7.jar",
|
27
|
+
"lib/tika-app-0.9-SNAPSHOT.jar",
|
28
|
+
"lib/tika-bundle-0.9-SNAPSHOT.jar",
|
29
|
+
"lib/tika-core-0.9-SNAPSHOT.jar",
|
30
|
+
"lib/tika-parsers-0.9-SNAPSHOT.jar",
|
31
31
|
"rtika.gemspec",
|
32
32
|
"test/helper.rb",
|
33
33
|
"test/test_rtika.rb"
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 2
|
7
|
+
- 3
|
8
8
|
- 0
|
9
|
-
version: 0.2.0
|
9
|
+
version: 0.3.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Pradeep Elankumaran
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-11-
|
17
|
+
date: 2010-11-09 00:00:00 -08:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|
@@ -35,10 +35,10 @@ files:
|
|
35
35
|
- Rakefile
|
36
36
|
- VERSION
|
37
37
|
- lib/rtika.rb
|
38
|
-
- lib/tika-app-0.7.jar
|
39
|
-
- lib/tika-bundle-0.7.jar
|
40
|
-
- lib/tika-core-0.7.jar
|
41
|
-
- lib/tika-parsers-0.7.jar
|
38
|
+
- lib/tika-app-0.9-SNAPSHOT.jar
|
39
|
+
- lib/tika-bundle-0.9-SNAPSHOT.jar
|
40
|
+
- lib/tika-core-0.9-SNAPSHOT.jar
|
41
|
+
- lib/tika-parsers-0.9-SNAPSHOT.jar
|
42
42
|
- rtika.gemspec
|
43
43
|
- test/helper.rb
|
44
44
|
- test/test_rtika.rb
|
data/lib/tika-bundle-0.7.jar
DELETED
Binary file
|
data/lib/tika-parsers-0.7.jar
DELETED
Binary file
|