rtika 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,11 @@ Make sure you're on JRuby first.
20
20
  puts result.content # returns <body> contents
21
21
  puts result.title # returns <title> contents
22
22
 
23
+ Options
24
+
25
+ :remove_boilerplate => true
26
+ # uses the Boilerpipe library that ships with Tika to remove headers & footers
27
+
23
28
  == Note on Patches/Pull Requests
24
29
 
25
30
  * Fork the project.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.3.0
@@ -7,7 +7,9 @@ Dir[File.join(File.dirname(__FILE__), "*.jar")].each do |jar|
7
7
  end
8
8
 
9
9
  module RTika
10
+ import org.apache.tika.parser.html.BoilerpipeContentHandler
10
11
  import org.apache.tika.sax.BodyContentHandler
12
+ import org.apache.tika.sax.WriteOutContentHandler
11
13
  import org.apache.tika.parser.AutoDetectParser
12
14
  import org.apache.tika.metadata.Metadata
13
15
 
@@ -43,10 +45,32 @@ module RTika
43
45
  new(*args).parse
44
46
  end
45
47
 
48
+ def remove_boilerplate?
49
+ @options[:remove_boilerplate] && @options[:remove_boilerplate] == true
50
+ end
51
+
52
+ def initialize(*args)
53
+ @options = args.last
54
+
55
+ if remove_boilerplate?
56
+ @writeout_content = RTika::WriteOutContentHandler.new(-1)
57
+ @content = RTika::BoilerpipeContentHandler.new(@writeout_content)
58
+ else
59
+ @content = RTika::BodyContentHandler.new(-1)
60
+ end
61
+
62
+ @metadata = RTika::Metadata.new
63
+ end
64
+
46
65
  def parse
47
66
  @parser = RTika::AutoDetectParser.new
48
- content, metadata = process
49
- RTika::ParsedResult.new(content, metadata)
67
+ @content, @metadata = process
68
+
69
+ if remove_boilerplate?
70
+ RTika::ParsedResult.new(@writeout_content, @metadata)
71
+ else
72
+ RTika::ParsedResult.new(@content, @metadata)
73
+ end
50
74
  end
51
75
 
52
76
  def process
@@ -55,56 +79,53 @@ module RTika
55
79
  end
56
80
 
57
81
  class StringParser < GenericParser
58
- def initialize(string)
82
+ def initialize(string, opts={})
83
+ super(opts)
59
84
  @input_string = string
60
85
  end
61
86
 
62
87
  def process
63
88
  input_stream = java.io.ByteArrayInputStream.new(@input_string.to_java.get_bytes)
64
- content = RTika::BodyContentHandler.new(-1)
65
- metadata = RTika::Metadata.new
66
89
 
67
- @parser.parse(input_stream, content, metadata)
90
+ @parser.parse(input_stream, @content, @metadata)
68
91
  input_stream.close
69
92
 
70
- return [content, metadata]
93
+ return [@content, @metadata]
71
94
  end
72
95
  end
73
96
 
74
97
  class FileParser < GenericParser
75
- def initialize(filename)
98
+ def initialize(filename, opts={})
99
+ super(opts)
76
100
  @filename = filename
77
101
  end
78
102
 
79
103
  def process
80
104
  input_stream = java.io.FileInputStream.new(java.io.File.new(@filename))
81
- content = RTika::BodyContentHandler.new(-1)
82
- metadata = RTika::Metadata.new
83
- metadata.set("filename", File.basename(@filename))
105
+ @metadata.set("filename", File.basename(@filename))
84
106
 
85
- @parser.parse(input_stream, content, metadata)
107
+ @parser.parse(input_stream, @content, @metadata)
86
108
  input_stream.close
87
109
 
88
- return [content, metadata]
110
+ return [@content, @metadata]
89
111
  end
90
112
  end
91
113
 
92
114
  class UrlParser < GenericParser
93
- def initialize(url, content)
115
+ def initialize(url, content, opts={})
116
+ super(opts)
94
117
  @url = url
95
- @content = content
118
+ @url_content = content
96
119
  end
97
120
 
98
121
  def process
99
- input_stream = java.io.ByteArrayInputStream.new(@content.to_java.get_bytes)
100
- content = RTika::BodyContentHandler.new(-1)
101
- metadata = RTika::Metadata.new
102
- metadata.set("filename", File.basename(@url))
122
+ input_stream = java.io.ByteArrayInputStream.new(@url_content.to_java.get_bytes)
123
+ @metadata.set("filename", File.basename(@url))
103
124
 
104
- @parser.parse(input_stream, content, metadata)
125
+ @parser.parse(input_stream, @content, @metadata)
105
126
  input_stream.close
106
127
 
107
- return [content, metadata]
128
+ return [@content, @metadata]
108
129
  end
109
130
  end
110
131
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{rtika}
8
- s.version = "0.2.0"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Pradeep Elankumaran"]
12
- s.date = %q{2010-11-03}
12
+ s.date = %q{2010-11-09}
13
13
  s.description = %q{rTika is a JRuby wrapper around the Apache Tika content extraction library}
14
14
  s.email = %q{pradeepe@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -24,10 +24,10 @@ Gem::Specification.new do |s|
24
24
  "Rakefile",
25
25
  "VERSION",
26
26
  "lib/rtika.rb",
27
- "lib/tika-app-0.7.jar",
28
- "lib/tika-bundle-0.7.jar",
29
- "lib/tika-core-0.7.jar",
30
- "lib/tika-parsers-0.7.jar",
27
+ "lib/tika-app-0.9-SNAPSHOT.jar",
28
+ "lib/tika-bundle-0.9-SNAPSHOT.jar",
29
+ "lib/tika-core-0.9-SNAPSHOT.jar",
30
+ "lib/tika-parsers-0.9-SNAPSHOT.jar",
31
31
  "rtika.gemspec",
32
32
  "test/helper.rb",
33
33
  "test/test_rtika.rb"
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 2
7
+ - 3
8
8
  - 0
9
- version: 0.2.0
9
+ version: 0.3.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Pradeep Elankumaran
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-11-03 00:00:00 -07:00
17
+ date: 2010-11-09 00:00:00 -08:00
18
18
  default_executable:
19
19
  dependencies: []
20
20
 
@@ -35,10 +35,10 @@ files:
35
35
  - Rakefile
36
36
  - VERSION
37
37
  - lib/rtika.rb
38
- - lib/tika-app-0.7.jar
39
- - lib/tika-bundle-0.7.jar
40
- - lib/tika-core-0.7.jar
41
- - lib/tika-parsers-0.7.jar
38
+ - lib/tika-app-0.9-SNAPSHOT.jar
39
+ - lib/tika-bundle-0.9-SNAPSHOT.jar
40
+ - lib/tika-core-0.9-SNAPSHOT.jar
41
+ - lib/tika-parsers-0.9-SNAPSHOT.jar
42
42
  - rtika.gemspec
43
43
  - test/helper.rb
44
44
  - test/test_rtika.rb
Binary file
Binary file