rika-stevedore 1.1.4-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (167) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +21 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +92 -0
  8. data/Rakefile +11 -0
  9. data/lib/rika/version.rb +3 -0
  10. data/lib/rika.rb +129 -0
  11. data/pom.xml +20 -0
  12. data/rika-stevedore.gemspec +21 -0
  13. data/spec/fixtures/de.txt +1 -0
  14. data/spec/fixtures/document.doc +0 -0
  15. data/spec/fixtures/document.docx +0 -0
  16. data/spec/fixtures/document.pdf +0 -0
  17. data/spec/fixtures/en.txt +1 -0
  18. data/spec/fixtures/es.txt +1 -0
  19. data/spec/fixtures/fr.txt +1 -0
  20. data/spec/fixtures/image.jpg +0 -0
  21. data/spec/fixtures/lang_cant_be_determined.txt +1 -0
  22. data/spec/fixtures/over_100k_file.txt +1241 -0
  23. data/spec/fixtures/ru.txt +1 -0
  24. data/spec/fixtures/text_file.txt +1 -0
  25. data/spec/fixtures/text_file_without_extension +1 -0
  26. data/spec/fixtures/unknown.bin +0 -0
  27. data/spec/rika_spec.rb +203 -0
  28. data/spec/spec_helper.rb +14 -0
  29. data/target/dependency/aopalliance-1.0.jar +0 -0
  30. data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
  31. data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
  32. data/target/dependency/asm-5.0.4.jar +0 -0
  33. data/target/dependency/bcmail-jdk15on-1.54.jar +0 -0
  34. data/target/dependency/bcpkix-jdk15on-1.54.jar +0 -0
  35. data/target/dependency/bcprov-jdk15on-1.54.jar +0 -0
  36. data/target/dependency/bndlib-1.43.0.jar +0 -0
  37. data/target/dependency/boilerpipe-1.1.0.jar +0 -0
  38. data/target/dependency/bzip2-0.9.1.jar +0 -0
  39. data/target/dependency/c3p0-0.9.1.1.jar +0 -0
  40. data/target/dependency/cdm-4.5.5.jar +0 -0
  41. data/target/dependency/cleartk-util-2.0.0.jar +0 -0
  42. data/target/dependency/commons-codec-1.10.jar +0 -0
  43. data/target/dependency/commons-collections4-4.1.jar +0 -0
  44. data/target/dependency/commons-compress-1.12.jar +0 -0
  45. data/target/dependency/commons-csv-1.0.jar +0 -0
  46. data/target/dependency/commons-exec-1.3.jar +0 -0
  47. data/target/dependency/commons-io-2.5.jar +0 -0
  48. data/target/dependency/commons-lang-2.6.jar +0 -0
  49. data/target/dependency/commons-logging-1.1.3.jar +0 -0
  50. data/target/dependency/commons-logging-api-1.1.jar +0 -0
  51. data/target/dependency/commons-vfs2-2.0.jar +0 -0
  52. data/target/dependency/ctakes-core-3.2.2.jar +0 -0
  53. data/target/dependency/ctakes-core-res-3.2.2.jar +0 -0
  54. data/target/dependency/ctakes-type-system-3.2.2.jar +0 -0
  55. data/target/dependency/ctakes-utils-3.2.2.jar +0 -0
  56. data/target/dependency/curvesapi-1.04.jar +0 -0
  57. data/target/dependency/cxf-core-3.0.3.jar +0 -0
  58. data/target/dependency/cxf-rt-frontend-jaxrs-3.0.3.jar +0 -0
  59. data/target/dependency/cxf-rt-rs-client-3.0.3.jar +0 -0
  60. data/target/dependency/cxf-rt-transports-http-3.0.3.jar +0 -0
  61. data/target/dependency/ehcache-core-2.6.2.jar +0 -0
  62. data/target/dependency/findstructapi-0.0.1.jar +0 -0
  63. data/target/dependency/fontbox-2.0.3.jar +0 -0
  64. data/target/dependency/geoapi-3.0.0.jar +0 -0
  65. data/target/dependency/grib-4.5.5.jar +0 -0
  66. data/target/dependency/gson-2.2.4.jar +0 -0
  67. data/target/dependency/guava-17.0.jar +0 -0
  68. data/target/dependency/hamcrest-core-1.3.jar +0 -0
  69. data/target/dependency/httpclient-4.5.2.jar +0 -0
  70. data/target/dependency/httpcore-4.4.4.jar +0 -0
  71. data/target/dependency/httpmime-4.5.2.jar +0 -0
  72. data/target/dependency/httpservices-4.5.5.jar +0 -0
  73. data/target/dependency/isoparser-1.1.18.jar +0 -0
  74. data/target/dependency/jVinci-2.4.0.jar +0 -0
  75. data/target/dependency/jackcess-2.1.4.jar +0 -0
  76. data/target/dependency/jackcess-encrypt-2.1.1.jar +0 -0
  77. data/target/dependency/jackson-core-2.8.1.jar +0 -0
  78. data/target/dependency/jai-imageio-core-1.3.1.jar +0 -0
  79. data/target/dependency/jakarta-regexp-1.4.jar +0 -0
  80. data/target/dependency/java-libpst-0.8.1.jar +0 -0
  81. data/target/dependency/javax.annotation-api-1.2.jar +0 -0
  82. data/target/dependency/javax.ws.rs-api-2.0.1.jar +0 -0
  83. data/target/dependency/jcip-annotations-1.0.jar +0 -0
  84. data/target/dependency/jcommander-1.35.jar +0 -0
  85. data/target/dependency/jdom-1.0.jar +0 -0
  86. data/target/dependency/jdom2-2.0.4.jar +0 -0
  87. data/target/dependency/jempbox-1.8.12.jar +0 -0
  88. data/target/dependency/jhighlight-1.0.2.jar +0 -0
  89. data/target/dependency/jj2000-5.2.jar +0 -0
  90. data/target/dependency/jmatio-1.2.jar +0 -0
  91. data/target/dependency/jna-4.1.0.jar +0 -0
  92. data/target/dependency/joda-time-2.2.jar +0 -0
  93. data/target/dependency/json-20140107.jar +0 -0
  94. data/target/dependency/json-simple-1.1.1.jar +0 -0
  95. data/target/dependency/jsoup-1.7.2.jar +0 -0
  96. data/target/dependency/jsr-275-0.9.3.jar +0 -0
  97. data/target/dependency/junit-4.11.jar +0 -0
  98. data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
  99. data/target/dependency/junrar-0.7.jar +0 -0
  100. data/target/dependency/jwnl-1.3.3.jar +0 -0
  101. data/target/dependency/libsvm-3.1.jar +0 -0
  102. data/target/dependency/lucene-analyzers-common-4.0.0.jar +0 -0
  103. data/target/dependency/lucene-core-4.0.0.jar +0 -0
  104. data/target/dependency/lucene-queries-4.0.0.jar +0 -0
  105. data/target/dependency/lucene-queryparser-4.0.0.jar +0 -0
  106. data/target/dependency/lucene-sandbox-4.0.0.jar +0 -0
  107. data/target/dependency/maven-scm-api-1.4.jar +0 -0
  108. data/target/dependency/maven-scm-provider-svn-commons-1.4.jar +0 -0
  109. data/target/dependency/maven-scm-provider-svnexe-1.4.jar +0 -0
  110. data/target/dependency/metadata-extractor-2.8.1.jar +0 -0
  111. data/target/dependency/mockito-core-1.7.jar +0 -0
  112. data/target/dependency/netcdf4-4.5.5.jar +0 -0
  113. data/target/dependency/objenesis-1.0.jar +0 -0
  114. data/target/dependency/openaifsm-0.0.1.jar +0 -0
  115. data/target/dependency/opennlp-maxent-3.0.3.jar +0 -0
  116. data/target/dependency/opennlp-tools-1.5.3.jar +0 -0
  117. data/target/dependency/org.apache.felix.scr.annotations-1.6.0.jar +0 -0
  118. data/target/dependency/org.osgi.compendium-4.0.0.jar +0 -0
  119. data/target/dependency/org.osgi.core-4.0.0.jar +0 -0
  120. data/target/dependency/pdfbox-2.0.3.jar +0 -0
  121. data/target/dependency/pdfbox-debugger-2.0.3.jar +0 -0
  122. data/target/dependency/pdfbox-tools-2.0.3.jar +0 -0
  123. data/target/dependency/plexus-utils-1.5.6.jar +0 -0
  124. data/target/dependency/poi-3.15.jar +0 -0
  125. data/target/dependency/poi-ooxml-3.15.jar +0 -0
  126. data/target/dependency/poi-ooxml-schemas-3.15.jar +0 -0
  127. data/target/dependency/poi-scratchpad-3.15.jar +0 -0
  128. data/target/dependency/protobuf-java-2.5.0.jar +0 -0
  129. data/target/dependency/quartz-2.2.0.jar +0 -0
  130. data/target/dependency/regexp-1.3.jar +0 -0
  131. data/target/dependency/rome-1.5.1.jar +0 -0
  132. data/target/dependency/rome-utils-1.5.1.jar +0 -0
  133. data/target/dependency/sis-metadata-0.6.jar +0 -0
  134. data/target/dependency/sis-netcdf-0.6.jar +0 -0
  135. data/target/dependency/sis-referencing-0.6.jar +0 -0
  136. data/target/dependency/sis-storage-0.6.jar +0 -0
  137. data/target/dependency/sis-utility-0.6.jar +0 -0
  138. data/target/dependency/slf4j-api-1.7.12.jar +0 -0
  139. data/target/dependency/slf4j-log4j12-1.7.12.jar +0 -0
  140. data/target/dependency/spring-aop-3.1.2.RELEASE.jar +0 -0
  141. data/target/dependency/spring-asm-3.1.2.RELEASE.jar +0 -0
  142. data/target/dependency/spring-beans-3.1.2.RELEASE.jar +0 -0
  143. data/target/dependency/spring-context-3.1.2.RELEASE.jar +0 -0
  144. data/target/dependency/spring-core-3.1.2.RELEASE.jar +0 -0
  145. data/target/dependency/spring-expression-3.1.2.RELEASE.jar +0 -0
  146. data/target/dependency/sqlite-jdbc-3.8.11.2.jar +0 -0
  147. data/target/dependency/sqlwrapper-0.0.1.jar +0 -0
  148. data/target/dependency/stax2-api-3.1.4.jar +0 -0
  149. data/target/dependency/tagsoup-1.2.1.jar +0 -0
  150. data/target/dependency/tika-core-1.14.jar +0 -0
  151. data/target/dependency/tika-parsers-1.14.jar +0 -0
  152. data/target/dependency/udunits-4.5.5.jar +0 -0
  153. data/target/dependency/uimafit-core-2.1.0.jar +0 -0
  154. data/target/dependency/uimaj-adapter-vinci-2.4.0.jar +0 -0
  155. data/target/dependency/uimaj-core-2.4.0.jar +0 -0
  156. data/target/dependency/uimaj-cpe-2.4.0.jar +0 -0
  157. data/target/dependency/uimaj-document-annotation-2.4.0.jar +0 -0
  158. data/target/dependency/uimaj-examples-2.4.0.jar +0 -0
  159. data/target/dependency/uimaj-tools-2.6.0.jar +0 -0
  160. data/target/dependency/vorbis-java-core-0.8.jar +0 -0
  161. data/target/dependency/vorbis-java-tika-0.8.jar +0 -0
  162. data/target/dependency/woodstox-core-asl-4.4.1.jar +0 -0
  163. data/target/dependency/xmlbeans-2.6.0.jar +0 -0
  164. data/target/dependency/xmlschema-core-2.1.0.jar +0 -0
  165. data/target/dependency/xmpcore-5.1.2.jar +0 -0
  166. data/target/dependency/xz-1.5.jar +0 -0
  167. metadata +254 -0
@@ -0,0 +1 @@
1
+ Любя, съешь щипцы, — вздохнёт мэр, — кайф жгуч. Шеф взъярён тчк щипцы с эхом гудбай Жюль. Эй, жлоб! Где туз? Прячь юных съёмщиц в шкаф. Экс-граф? Плюш изъят. Бьём чуждый цен хвощ! Эх, чужак! Общий съём цен шляп (юфть) — вдрызг! Любя, съешь щипцы, — вздохнёт мэр, — кайф жгуч. Шеф взъярён тчк щипцы с эхом гудбай Жюль. Эй, жлоб! Где туз? Прячь юных съёмщиц в шкаф. Экс-граф? Плюш изъят. Бьём чуждый цен хвощ! Эх, чужак! Общий съём цен шляп (юфть) — вдрызг! Любя, съешь щипцы, — вздохнёт мэр, — кайф жгуч. Шеф взъярён тчк щипцы с эхом гудбай Жюль. Эй, жлоб! Где туз? Прячь юных съёмщиц в шкаф. Экс-граф? Плюш изъят. Бьём чуждый цен хвощ! Эх, чужак! Общий съём цен шляп (юфть) — вдрызг! Любя, съешь щипцы, — вздохнёт мэр, — кайф жгуч. Шеф взъярён тчк щипцы с эхом гудбай Жюль. Эй, жлоб! Где туз? Прячь юных съёмщиц в шкаф. Экс-граф? Плюш изъят. Бьём чуждый цен хвощ! Эх, чужак! Общий съём цен шляп (юфть) — вдрызг!Любя, съешь щипцы, — вздохнёт мэр, — кайф жгуч. Шеф взъярён тчк щипцы с эхом гудбай Жюль. Эй, жлоб! Где туз? Прячь юных съёмщиц в шкаф. Экс-граф? Плюш изъят. Бьём чуждый цен
@@ -0,0 +1 @@
1
+ First they ignore you, then they ridicule you, then they fight you, then you win.
@@ -0,0 +1 @@
1
+ First they ignore you, then they ridicule you, then they fight you, then you win.
Binary file
data/spec/rika_spec.rb ADDED
@@ -0,0 +1,203 @@
1
+ # encoding: utf-8
2
+
3
+ require 'spec_helper'
4
+ require 'webrick'
5
+
6
+ include WEBrick
7
+
8
+ describe Rika::Parser do
9
+ before(:all) do
10
+ @txt_parser = Rika::Parser.new(file_path("text_file.txt"))
11
+ @docx_parser = Rika::Parser.new(file_path("document.docx"))
12
+ @doc_parser = Rika::Parser.new(file_path("document.doc"))
13
+ @pdf_parser = Rika::Parser.new(file_path("document.pdf"))
14
+ @image_parser = Rika::Parser.new(file_path("image.jpg"))
15
+ @unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
16
+ @dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
17
+ port = 50515
18
+ @url = "http://#{Socket.gethostname}:#{port}"
19
+ @quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
20
+ @t1 = Thread.new do
21
+ @server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
22
+ :AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
23
+ @server.start
24
+ end
25
+ @sample_pdf_filespec = file_path("document.pdf")
26
+ end
27
+
28
+ after(:all) do
29
+ @t1.exit
30
+ end
31
+
32
+ it "should raise error if file does not exists" do
33
+ lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError)
34
+ end
35
+
36
+ it "should raise error if URL does not exists" do
37
+ lambda { Rika::Parser.new("http://rika.clearly-non-existent.github.com/whatever.pdf").content }.should raise_error(java.io.FileNotFoundException)
38
+ end
39
+
40
+ it "should detect file type without a file extension" do
41
+ parser = Rika::Parser.new(file_path("text_file_without_extension"))
42
+ parser.metadata["Content-Type"].should == "text/plain; charset=ISO-8859-1"
43
+ end
44
+
45
+ it "should not be possible to trick the parser to read a folder with an extension" do
46
+ lambda { Rika::Parser.new(file_path("folder.js")).content }.should raise_error(IOError)
47
+ end
48
+
49
+ describe '#content' do
50
+ it "should return the content in a text file" do
51
+ @txt_parser.content.strip.should == @quote
52
+ end
53
+
54
+ it "should return the content in a docx file" do
55
+ @docx_parser.content.should == @quote
56
+ end
57
+
58
+ it "should return the content in a pdf file" do
59
+ @pdf_parser.content.should == @quote
60
+ end
61
+
62
+ it "should return no content for an image" do
63
+ @image_parser.content.should be_empty
64
+ end
65
+
66
+ it "should only return max content length" do
67
+ parser = Rika::Parser.new(file_path("text_file.txt"), 5)
68
+ parser.content.should == "First"
69
+ end
70
+
71
+ it "should only return max content length for file over http" do
72
+ parser = Rika::Parser.new(@url + "/document.pdf", 6)
73
+ parser.content.should == "First"
74
+ end
75
+
76
+ it "should be possible to read files over 100k by default" do
77
+ parser = Rika::Parser.new(file_path("over_100k_file.txt"))
78
+ parser.content.length.should == 101_761
79
+ end
80
+
81
+ it "should return the content from a file over http" do
82
+ parser = Rika::Parser.new(@url + "/document.pdf")
83
+ parser.content.should == @quote
84
+ end
85
+
86
+ it "should return empty string for unknown file" do
87
+ @unknown_parser.content.should be_empty
88
+ end
89
+ end
90
+
91
+ # We just test a few of the metadata fields for some common file formats
92
+ # to make sure the integration with Apache Tika works. Apache Tika already
93
+ # has tests for all file formats it supports, so we won't retest them.
94
+ describe '#metadata' do
95
+ it "should return nil if metadata field does not exists" do
96
+ @txt_parser.metadata["nonsense"].should be_nil
97
+ end
98
+
99
+ it "should return metadata from a docx file" do
100
+ @docx_parser.metadata["Page-Count"].should == "1"
101
+ end
102
+
103
+ it "should return metadata from a pdf file" do
104
+ @pdf_parser.metadata["title"].should == "A simple title"
105
+ end
106
+
107
+ it "should return metadata from a file over http" do
108
+ parser = Rika::Parser.new(@url + "/document.pdf")
109
+ parser.metadata["title"].should == "A simple title"
110
+ end
111
+
112
+ # TIKA appears to no longer support this.
113
+ # it "should return metadata from an image" do
114
+ # @image_parser.metadata["Image Height"].should == "72 pixels"
115
+ # @image_parser.metadata["Image Width"].should == "72 pixels"
116
+ # end
117
+ end
118
+
119
+ describe '#available_metadata' do
120
+ it "should return available metadata fields" do
121
+ @txt_parser.available_metadata.should_not be_empty
122
+ end
123
+
124
+ it "should be an array" do
125
+ @txt_parser.available_metadata.is_a?(Array).should == true
126
+ end
127
+ end
128
+
129
+ describe '#metadata_exists?' do
130
+ it "should return false if metadata does not exists" do
131
+ @txt_parser.metadata_exists?("title").should == false
132
+ end
133
+
134
+ it "should return true if metadata exists" do
135
+ @docx_parser.metadata_exists?("title").should == true
136
+ end
137
+ end
138
+
139
+ describe '#media_type' do
140
+ it "should return application/pdf for a pdf file" do
141
+ @pdf_parser.media_type.should == "application/pdf"
142
+ end
143
+
144
+ it "should return text/plain for a txt file" do
145
+ @txt_parser.media_type.should == "text/plain"
146
+ end
147
+
148
+ it "should return application/pdf for a pdf over http" do
149
+ parser = Rika::Parser.new(@url + "/document.pdf")
150
+ parser.media_type.should == "application/pdf"
151
+ end
152
+
153
+ it "should return application/octet-stream for unknown file" do
154
+ @unknown_parser.media_type.should == "application/octet-stream"
155
+ end
156
+
157
+ it "should return msword for a doc file" do
158
+ @doc_parser.media_type.should == "application/msword"
159
+ end
160
+
161
+ it "should return wordprocessingml for a docx file" do
162
+ @docx_parser.media_type.should == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
163
+ end
164
+ end
165
+
166
+ describe '#language' do
167
+ it "should return the language of the content" do
168
+
169
+ ["en", "de", "fr", "ru", "es"].each do |lang|
170
+ txt = Rika::Parser.new(file_path("#{lang}.txt"))
171
+ txt.language.should == lang
172
+ end
173
+ end
174
+ end
175
+
176
+ describe '#language_is_reasonably_certain?' do
177
+ it "should return false if lang can't be determined" do
178
+ lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
179
+ lang.language_is_reasonably_certain? == false
180
+ end
181
+
182
+ it "should return true if language can be determined" do
183
+ lang = Rika::Parser.new(file_path("en.txt"))
184
+ lang.language_is_reasonably_certain? == true
185
+ end
186
+ end
187
+
188
+ it "should return valid content using Rika.parse_content" do
189
+ content = Rika.parse_content(@sample_pdf_filespec)
190
+ (content.should be_a(String)) && (content.should_not be_empty)
191
+ end
192
+
193
+ it "should return valid metadata using Rika.parse_metadata" do
194
+ metadata = Rika.parse_metadata(@sample_pdf_filespec)
195
+ (metadata.should be_a(Hash)) && (metadata.should_not be_empty)
196
+ end
197
+
198
+ it "should return valid content and metadata using Rika.parse_content_and_metadata" do
199
+ content, metadata = Rika.parse_content_and_metadata(@sample_pdf_filespec)
200
+ (content.should be_a(String)) && (content.should_not be_empty) && \
201
+ (metadata.should be_a(Hash)) && (metadata.should_not be_empty)
202
+ end
203
+ end
@@ -0,0 +1,14 @@
1
+ require "rika"
2
+
3
+ def file_path( *paths )
4
+ File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
5
+ end
6
+
7
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
8
+ RSpec.configure do |config|
9
+ config.treat_symbols_as_metadata_keys_with_true_values = true
10
+ config.run_all_when_everything_filtered = true
11
+ config.filter_run :focus
12
+
13
+ config.order = 'random'
14
+ end
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file