rika 1.6.0-java → 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +6 -4
- data/.rubocop.yml +49 -0
- data/Gemfile +12 -0
- data/README.md +226 -47
- data/RELEASE_NOTES.md +43 -0
- data/Rakefile +4 -7
- data/bin/rika +13 -0
- data/lib/rika/cli/args_parser.rb +131 -0
- data/lib/rika/cli/rika_command.rb +129 -0
- data/lib/rika/formatters.rb +39 -0
- data/lib/rika/parse_result.rb +34 -0
- data/lib/rika/parser.rb +84 -0
- data/lib/rika/tika_loader.rb +65 -0
- data/lib/rika/version.rb +3 -1
- data/lib/rika.rb +96 -104
- data/pom.xml +2 -2
- data/rika.gemspec +30 -15
- data/rika_helper.rb +30 -0
- data/spec/fixtures/de.txt +21 -1
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/document.txt +23 -0
- data/spec/fixtures/en.txt +23 -1
- data/spec/fixtures/es.txt +21 -1
- data/spec/fixtures/fr.txt +23 -1
- data/spec/fixtures/image_jpg_without_extension +0 -0
- data/spec/fixtures/ru.txt +21 -1
- data/spec/fixtures/tiny.txt +1 -0
- data/spec/rika/cli/args_parser_spec.rb +117 -0
- data/spec/rika/cli/rika_command_spec.rb +120 -0
- data/spec/rika/formatters_spec.rb +23 -0
- data/spec/rika/parse_result_spec.rb +42 -0
- data/spec/rika/parser_spec.rb +304 -0
- data/spec/rika/rika_spec.rb +10 -0
- data/spec/rika/tika_loader_spec.rb +57 -0
- data/spec/spec_helper.rb +13 -5
- metadata +54 -98
- data/.travis.yml +0 -7
- data/spec/fixtures/over_100k_file.txt +0 -1241
- data/spec/fixtures/text_file.txt +0 -1
- data/spec/fixtures/text_file_without_extension +0 -1
- data/spec/rika_spec.rb +0 -202
- data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
- data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
- data/target/dependency/asm-debug-all-4.1.jar +0 -0
- data/target/dependency/aspectjrt-1.8.0.jar +0 -0
- data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
- data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
- data/target/dependency/boilerpipe-1.1.0.jar +0 -0
- data/target/dependency/commons-codec-1.9.jar +0 -0
- data/target/dependency/commons-compress-1.8.1.jar +0 -0
- data/target/dependency/commons-httpclient-3.1.jar +0 -0
- data/target/dependency/commons-logging-1.1.1.jar +0 -0
- data/target/dependency/fontbox-1.8.6.jar +0 -0
- data/target/dependency/isoparser-1.0.2.jar +0 -0
- data/target/dependency/java-libpst-0.8.1.jar +0 -0
- data/target/dependency/jcip-annotations-1.0.jar +0 -0
- data/target/dependency/jdom-1.0.jar +0 -0
- data/target/dependency/jempbox-1.8.6.jar +0 -0
- data/target/dependency/jhighlight-1.0.jar +0 -0
- data/target/dependency/jmatio-1.0.jar +0 -0
- data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
- data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
- data/target/dependency/netcdf-4.2.20.jar +0 -0
- data/target/dependency/pdfbox-1.8.6.jar +0 -0
- data/target/dependency/poi-3.11-beta2.jar +0 -0
- data/target/dependency/poi-ooxml-3.11-beta2.jar +0 -0
- data/target/dependency/poi-ooxml-schemas-3.11-beta2.jar +0 -0
- data/target/dependency/poi-scratchpad-3.11-beta2.jar +0 -0
- data/target/dependency/rome-1.0.jar +0 -0
- data/target/dependency/slf4j-api-1.6.1.jar +0 -0
- data/target/dependency/tagsoup-1.2.1.jar +0 -0
- data/target/dependency/tika-core-1.6.jar +0 -0
- data/target/dependency/tika-parsers-1.6.jar +0 -0
- data/target/dependency/unidataCommon-4.2.20.jar +0 -0
- data/target/dependency/vorbis-java-core-0.6.jar +0 -0
- data/target/dependency/vorbis-java-tika-0.6.jar +0 -0
- data/target/dependency/xercesImpl-2.8.1.jar +0 -0
- data/target/dependency/xml-apis-1.3.03.jar +0 -0
- data/target/dependency/xmlbeans-2.6.0.jar +0 -0
- data/target/dependency/xmpcore-5.1.2.jar +0 -0
- data/target/dependency/xz-1.5.jar +0 -0
data/spec/fixtures/text_file.txt
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
First they ignore you, then they ridicule you, then they fight you, then you win.
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
First they ignore you, then they ridicule you, then they fight you, then you win.
|
data/spec/rika_spec.rb
DELETED
|
@@ -1,202 +0,0 @@
|
|
|
1
|
-
# encoding: utf-8
|
|
2
|
-
|
|
3
|
-
require 'spec_helper'
|
|
4
|
-
require 'webrick'
|
|
5
|
-
|
|
6
|
-
include WEBrick
|
|
7
|
-
|
|
8
|
-
describe Rika::Parser do
|
|
9
|
-
before(:all) do
|
|
10
|
-
@txt_parser = Rika::Parser.new(file_path("text_file.txt"))
|
|
11
|
-
@docx_parser = Rika::Parser.new(file_path("document.docx"))
|
|
12
|
-
@doc_parser = Rika::Parser.new(file_path("document.doc"))
|
|
13
|
-
@pdf_parser = Rika::Parser.new(file_path("document.pdf"))
|
|
14
|
-
@image_parser = Rika::Parser.new(file_path("image.jpg"))
|
|
15
|
-
@unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
|
|
16
|
-
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
|
17
|
-
port = 50515
|
|
18
|
-
@url = "http://#{Socket.gethostname}:#{port}"
|
|
19
|
-
@quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
|
|
20
|
-
@t1 = Thread.new do
|
|
21
|
-
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
|
22
|
-
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
|
23
|
-
@server.start
|
|
24
|
-
end
|
|
25
|
-
@sample_pdf_filespec = file_path("document.pdf")
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
after(:all) do
|
|
29
|
-
@t1.exit
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
it "should raise error if file does not exists" do
|
|
33
|
-
lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError)
|
|
34
|
-
end
|
|
35
|
-
|
|
36
|
-
it "should raise error if URL does not exists" do
|
|
37
|
-
lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError)
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
it "should detect file type without a file extension" do
|
|
41
|
-
parser = Rika::Parser.new(file_path("text_file_without_extension"))
|
|
42
|
-
parser.metadata["Content-Type"].should == "text/plain; charset=ISO-8859-1"
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
it "should not be possible to trick the parser to read a folder with an extension" do
|
|
46
|
-
lambda { Rika::Parser.new(file_path("folder.js")).content }.should raise_error(IOError)
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
describe '#content' do
|
|
50
|
-
it "should return the content in a text file" do
|
|
51
|
-
@txt_parser.content.strip.should == @quote
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
it "should return the content in a docx file" do
|
|
55
|
-
@docx_parser.content.should == @quote
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
it "should return the content in a pdf file" do
|
|
59
|
-
@pdf_parser.content.should == @quote
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
it "should return no content for an image" do
|
|
63
|
-
@image_parser.content.should be_empty
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
it "should only return max content length" do
|
|
67
|
-
parser = Rika::Parser.new(file_path("text_file.txt"), 5)
|
|
68
|
-
parser.content.should == "First"
|
|
69
|
-
end
|
|
70
|
-
|
|
71
|
-
it "should only return max content length for file over http" do
|
|
72
|
-
parser = Rika::Parser.new(@url + "/document.pdf", 6)
|
|
73
|
-
parser.content.should == "First"
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
it "should be possible to read files over 100k by default" do
|
|
77
|
-
parser = Rika::Parser.new(file_path("over_100k_file.txt"))
|
|
78
|
-
parser.content.length.should == 101_761
|
|
79
|
-
end
|
|
80
|
-
|
|
81
|
-
it "should return the content from a file over http" do
|
|
82
|
-
parser = Rika::Parser.new(@url + "/document.pdf")
|
|
83
|
-
parser.content.should == @quote
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
it "should return empty string for unknown file" do
|
|
87
|
-
@unknown_parser.content.should be_empty
|
|
88
|
-
end
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
# We just test a few of the metadata fields for some common file formats
|
|
92
|
-
# to make sure the integration with Apache Tika works. Apache Tika already
|
|
93
|
-
# have tests for all file formats it supports so we won't retest that
|
|
94
|
-
describe '#metadata' do
|
|
95
|
-
it "should return nil if metadata field does not exists" do
|
|
96
|
-
@txt_parser.metadata["nonsense"].should be_nil
|
|
97
|
-
end
|
|
98
|
-
|
|
99
|
-
it "should return metadata from a docx file" do
|
|
100
|
-
@docx_parser.metadata["Page-Count"].should == "1"
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
it "should return metadata from a pdf file" do
|
|
104
|
-
@pdf_parser.metadata["title"].should == "A simple title"
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
it "should return metadata from a file over http" do
|
|
108
|
-
parser = Rika::Parser.new(@url + "/document.pdf")
|
|
109
|
-
parser.metadata["title"].should == "A simple title"
|
|
110
|
-
end
|
|
111
|
-
|
|
112
|
-
it "should return metadata from an image" do
|
|
113
|
-
@image_parser.metadata["Image Height"].should == "72 pixels"
|
|
114
|
-
@image_parser.metadata["Image Width"].should == "72 pixels"
|
|
115
|
-
end
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
describe '#available_metadata' do
|
|
119
|
-
it "should return available metadata fields" do
|
|
120
|
-
@txt_parser.available_metadata.should_not be_empty
|
|
121
|
-
end
|
|
122
|
-
|
|
123
|
-
it "should be an array" do
|
|
124
|
-
@txt_parser.available_metadata.is_a?(Array).should == true
|
|
125
|
-
end
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
describe '#metadata_exists?' do
|
|
129
|
-
it "should return false if metadata does not exists" do
|
|
130
|
-
@txt_parser.metadata_exists?("title").should == false
|
|
131
|
-
end
|
|
132
|
-
|
|
133
|
-
it "should return true if metadata exists" do
|
|
134
|
-
@docx_parser.metadata_exists?("title").should == true
|
|
135
|
-
end
|
|
136
|
-
end
|
|
137
|
-
|
|
138
|
-
describe '#media_type' do
|
|
139
|
-
it "should return application/pdf for a pdf file" do
|
|
140
|
-
@pdf_parser.media_type.should == "application/pdf"
|
|
141
|
-
end
|
|
142
|
-
|
|
143
|
-
it "should return text/plain for a txt file" do
|
|
144
|
-
@txt_parser.media_type.should == "text/plain"
|
|
145
|
-
end
|
|
146
|
-
|
|
147
|
-
it "should return application/pdf for a pdf over http" do
|
|
148
|
-
parser = Rika::Parser.new(@url + "/document.pdf")
|
|
149
|
-
parser.media_type.should == "application/pdf"
|
|
150
|
-
end
|
|
151
|
-
|
|
152
|
-
it "should return application/octet-stream for unknown file" do
|
|
153
|
-
@unknown_parser.media_type.should == "application/octet-stream"
|
|
154
|
-
end
|
|
155
|
-
|
|
156
|
-
it "should return msword for a doc file" do
|
|
157
|
-
@doc_parser.media_type.should == "application/msword"
|
|
158
|
-
end
|
|
159
|
-
|
|
160
|
-
it "should return wordprocessingml for a docx file" do
|
|
161
|
-
@docx_parser.media_type.should == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
162
|
-
end
|
|
163
|
-
end
|
|
164
|
-
|
|
165
|
-
describe '#language' do
|
|
166
|
-
it "should return the language of the content" do
|
|
167
|
-
|
|
168
|
-
["en", "de", "fr", "ru", "es"].each do |lang|
|
|
169
|
-
txt = Rika::Parser.new(file_path("#{lang}.txt"))
|
|
170
|
-
txt.language.should == lang
|
|
171
|
-
end
|
|
172
|
-
end
|
|
173
|
-
end
|
|
174
|
-
|
|
175
|
-
describe '#language_is_reasonably_certain?' do
|
|
176
|
-
it "should return false if lang can't be determined" do
|
|
177
|
-
lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
|
|
178
|
-
lang.language_is_reasonably_certain? == false
|
|
179
|
-
end
|
|
180
|
-
|
|
181
|
-
it "should return true if language can be determined" do
|
|
182
|
-
lang = Rika::Parser.new(file_path("en.txt"))
|
|
183
|
-
lang.language_is_reasonably_certain? == true
|
|
184
|
-
end
|
|
185
|
-
end
|
|
186
|
-
|
|
187
|
-
it "should return valid content using Rika.parse_content" do
|
|
188
|
-
content = Rika.parse_content(@sample_pdf_filespec)
|
|
189
|
-
(content.should be_a(String)) && (content.should_not be_empty)
|
|
190
|
-
end
|
|
191
|
-
|
|
192
|
-
it "should return valid metadata using Rika.parse_metadata" do
|
|
193
|
-
metadata = Rika.parse_metadata(@sample_pdf_filespec)
|
|
194
|
-
(metadata.should be_a(Hash)) && (metadata.should_not be_empty)
|
|
195
|
-
end
|
|
196
|
-
|
|
197
|
-
it "should return valid content and metadata using Rika.parse_content_and_metadata" do
|
|
198
|
-
content, metadata = Rika.parse_content_and_metadata(@sample_pdf_filespec)
|
|
199
|
-
(content.should be_a(String)) && (content.should_not be_empty) && \
|
|
200
|
-
(metadata.should be_a(Hash)) && (metadata.should_not be_empty)
|
|
201
|
-
end
|
|
202
|
-
end
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|