rika 1.6.0-java → 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. checksums.yaml +5 -5
  2. data/.gitignore +6 -4
  3. data/.rubocop.yml +49 -0
  4. data/Gemfile +12 -0
  5. data/README.md +226 -47
  6. data/RELEASE_NOTES.md +43 -0
  7. data/Rakefile +4 -7
  8. data/bin/rika +13 -0
  9. data/lib/rika/cli/args_parser.rb +131 -0
  10. data/lib/rika/cli/rika_command.rb +129 -0
  11. data/lib/rika/formatters.rb +39 -0
  12. data/lib/rika/parse_result.rb +34 -0
  13. data/lib/rika/parser.rb +84 -0
  14. data/lib/rika/tika_loader.rb +65 -0
  15. data/lib/rika/version.rb +3 -1
  16. data/lib/rika.rb +96 -104
  17. data/pom.xml +2 -2
  18. data/rika.gemspec +30 -15
  19. data/rika_helper.rb +30 -0
  20. data/spec/fixtures/de.txt +21 -1
  21. data/spec/fixtures/document.doc +0 -0
  22. data/spec/fixtures/document.docx +0 -0
  23. data/spec/fixtures/document.pdf +0 -0
  24. data/spec/fixtures/document.txt +23 -0
  25. data/spec/fixtures/en.txt +23 -1
  26. data/spec/fixtures/es.txt +21 -1
  27. data/spec/fixtures/fr.txt +23 -1
  28. data/spec/fixtures/image_jpg_without_extension +0 -0
  29. data/spec/fixtures/ru.txt +21 -1
  30. data/spec/fixtures/tiny.txt +1 -0
  31. data/spec/rika/cli/args_parser_spec.rb +117 -0
  32. data/spec/rika/cli/rika_command_spec.rb +120 -0
  33. data/spec/rika/formatters_spec.rb +23 -0
  34. data/spec/rika/parse_result_spec.rb +42 -0
  35. data/spec/rika/parser_spec.rb +304 -0
  36. data/spec/rika/rika_spec.rb +10 -0
  37. data/spec/rika/tika_loader_spec.rb +57 -0
  38. data/spec/spec_helper.rb +13 -5
  39. metadata +54 -98
  40. data/.travis.yml +0 -7
  41. data/spec/fixtures/over_100k_file.txt +0 -1241
  42. data/spec/fixtures/text_file.txt +0 -1
  43. data/spec/fixtures/text_file_without_extension +0 -1
  44. data/spec/rika_spec.rb +0 -202
  45. data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
  46. data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
  47. data/target/dependency/asm-debug-all-4.1.jar +0 -0
  48. data/target/dependency/aspectjrt-1.8.0.jar +0 -0
  49. data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
  50. data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
  51. data/target/dependency/boilerpipe-1.1.0.jar +0 -0
  52. data/target/dependency/commons-codec-1.9.jar +0 -0
  53. data/target/dependency/commons-compress-1.8.1.jar +0 -0
  54. data/target/dependency/commons-httpclient-3.1.jar +0 -0
  55. data/target/dependency/commons-logging-1.1.1.jar +0 -0
  56. data/target/dependency/fontbox-1.8.6.jar +0 -0
  57. data/target/dependency/isoparser-1.0.2.jar +0 -0
  58. data/target/dependency/java-libpst-0.8.1.jar +0 -0
  59. data/target/dependency/jcip-annotations-1.0.jar +0 -0
  60. data/target/dependency/jdom-1.0.jar +0 -0
  61. data/target/dependency/jempbox-1.8.6.jar +0 -0
  62. data/target/dependency/jhighlight-1.0.jar +0 -0
  63. data/target/dependency/jmatio-1.0.jar +0 -0
  64. data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
  65. data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
  66. data/target/dependency/netcdf-4.2.20.jar +0 -0
  67. data/target/dependency/pdfbox-1.8.6.jar +0 -0
  68. data/target/dependency/poi-3.11-beta2.jar +0 -0
  69. data/target/dependency/poi-ooxml-3.11-beta2.jar +0 -0
  70. data/target/dependency/poi-ooxml-schemas-3.11-beta2.jar +0 -0
  71. data/target/dependency/poi-scratchpad-3.11-beta2.jar +0 -0
  72. data/target/dependency/rome-1.0.jar +0 -0
  73. data/target/dependency/slf4j-api-1.6.1.jar +0 -0
  74. data/target/dependency/tagsoup-1.2.1.jar +0 -0
  75. data/target/dependency/tika-core-1.6.jar +0 -0
  76. data/target/dependency/tika-parsers-1.6.jar +0 -0
  77. data/target/dependency/unidataCommon-4.2.20.jar +0 -0
  78. data/target/dependency/vorbis-java-core-0.6.jar +0 -0
  79. data/target/dependency/vorbis-java-tika-0.6.jar +0 -0
  80. data/target/dependency/xercesImpl-2.8.1.jar +0 -0
  81. data/target/dependency/xml-apis-1.3.03.jar +0 -0
  82. data/target/dependency/xmlbeans-2.6.0.jar +0 -0
  83. data/target/dependency/xmpcore-5.1.2.jar +0 -0
  84. data/target/dependency/xz-1.5.jar +0 -0
data/spec/fixtures/text_file.txt DELETED
@@ -1 +0,0 @@
1
- First they ignore you, then they ridicule you, then they fight you, then you win.
data/spec/fixtures/text_file_without_extension DELETED
@@ -1 +0,0 @@
1
- First they ignore you, then they ridicule you, then they fight you, then you win.
data/spec/rika_spec.rb DELETED
@@ -1,202 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require 'spec_helper'
4
- require 'webrick'
5
-
6
- include WEBrick
7
-
8
- describe Rika::Parser do
9
- before(:all) do
10
- @txt_parser = Rika::Parser.new(file_path("text_file.txt"))
11
- @docx_parser = Rika::Parser.new(file_path("document.docx"))
12
- @doc_parser = Rika::Parser.new(file_path("document.doc"))
13
- @pdf_parser = Rika::Parser.new(file_path("document.pdf"))
14
- @image_parser = Rika::Parser.new(file_path("image.jpg"))
15
- @unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
16
- @dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
17
- port = 50515
18
- @url = "http://#{Socket.gethostname}:#{port}"
19
- @quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
20
- @t1 = Thread.new do
21
- @server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
22
- :AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
23
- @server.start
24
- end
25
- @sample_pdf_filespec = file_path("document.pdf")
26
- end
27
-
28
- after(:all) do
29
- @t1.exit
30
- end
31
-
32
- it "should raise error if file does not exists" do
33
- lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError)
34
- end
35
-
36
- it "should raise error if URL does not exists" do
37
- lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError)
38
- end
39
-
40
- it "should detect file type without a file extension" do
41
- parser = Rika::Parser.new(file_path("text_file_without_extension"))
42
- parser.metadata["Content-Type"].should == "text/plain; charset=ISO-8859-1"
43
- end
44
-
45
- it "should not be possible to trick the parser to read a folder with an extension" do
46
- lambda { Rika::Parser.new(file_path("folder.js")).content }.should raise_error(IOError)
47
- end
48
-
49
- describe '#content' do
50
- it "should return the content in a text file" do
51
- @txt_parser.content.strip.should == @quote
52
- end
53
-
54
- it "should return the content in a docx file" do
55
- @docx_parser.content.should == @quote
56
- end
57
-
58
- it "should return the content in a pdf file" do
59
- @pdf_parser.content.should == @quote
60
- end
61
-
62
- it "should return no content for an image" do
63
- @image_parser.content.should be_empty
64
- end
65
-
66
- it "should only return max content length" do
67
- parser = Rika::Parser.new(file_path("text_file.txt"), 5)
68
- parser.content.should == "First"
69
- end
70
-
71
- it "should only return max content length for file over http" do
72
- parser = Rika::Parser.new(@url + "/document.pdf", 6)
73
- parser.content.should == "First"
74
- end
75
-
76
- it "should be possible to read files over 100k by default" do
77
- parser = Rika::Parser.new(file_path("over_100k_file.txt"))
78
- parser.content.length.should == 101_761
79
- end
80
-
81
- it "should return the content from a file over http" do
82
- parser = Rika::Parser.new(@url + "/document.pdf")
83
- parser.content.should == @quote
84
- end
85
-
86
- it "should return empty string for unknown file" do
87
- @unknown_parser.content.should be_empty
88
- end
89
- end
90
-
91
- # We just test a few of the metadata fields for some common file formats
92
- # to make sure the integration with Apache Tika works. Apache Tika already
93
- # have tests for all file formats it supports so we won't retest that
94
- describe '#metadata' do
95
- it "should return nil if metadata field does not exists" do
96
- @txt_parser.metadata["nonsense"].should be_nil
97
- end
98
-
99
- it "should return metadata from a docx file" do
100
- @docx_parser.metadata["Page-Count"].should == "1"
101
- end
102
-
103
- it "should return metadata from a pdf file" do
104
- @pdf_parser.metadata["title"].should == "A simple title"
105
- end
106
-
107
- it "should return metadata from a file over http" do
108
- parser = Rika::Parser.new(@url + "/document.pdf")
109
- parser.metadata["title"].should == "A simple title"
110
- end
111
-
112
- it "should return metadata from an image" do
113
- @image_parser.metadata["Image Height"].should == "72 pixels"
114
- @image_parser.metadata["Image Width"].should == "72 pixels"
115
- end
116
- end
117
-
118
- describe '#available_metadata' do
119
- it "should return available metadata fields" do
120
- @txt_parser.available_metadata.should_not be_empty
121
- end
122
-
123
- it "should be an array" do
124
- @txt_parser.available_metadata.is_a?(Array).should == true
125
- end
126
- end
127
-
128
- describe '#metadata_exists?' do
129
- it "should return false if metadata does not exists" do
130
- @txt_parser.metadata_exists?("title").should == false
131
- end
132
-
133
- it "should return true if metadata exists" do
134
- @docx_parser.metadata_exists?("title").should == true
135
- end
136
- end
137
-
138
- describe '#media_type' do
139
- it "should return application/pdf for a pdf file" do
140
- @pdf_parser.media_type.should == "application/pdf"
141
- end
142
-
143
- it "should return text/plain for a txt file" do
144
- @txt_parser.media_type.should == "text/plain"
145
- end
146
-
147
- it "should return application/pdf for a pdf over http" do
148
- parser = Rika::Parser.new(@url + "/document.pdf")
149
- parser.media_type.should == "application/pdf"
150
- end
151
-
152
- it "should return application/octet-stream for unknown file" do
153
- @unknown_parser.media_type.should == "application/octet-stream"
154
- end
155
-
156
- it "should return msword for a doc file" do
157
- @doc_parser.media_type.should == "application/msword"
158
- end
159
-
160
- it "should return wordprocessingml for a docx file" do
161
- @docx_parser.media_type.should == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
162
- end
163
- end
164
-
165
- describe '#language' do
166
- it "should return the language of the content" do
167
-
168
- ["en", "de", "fr", "ru", "es"].each do |lang|
169
- txt = Rika::Parser.new(file_path("#{lang}.txt"))
170
- txt.language.should == lang
171
- end
172
- end
173
- end
174
-
175
- describe '#language_is_reasonably_certain?' do
176
- it "should return false if lang can't be determined" do
177
- lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
178
- lang.language_is_reasonably_certain? == false
179
- end
180
-
181
- it "should return true if language can be determined" do
182
- lang = Rika::Parser.new(file_path("en.txt"))
183
- lang.language_is_reasonably_certain? == true
184
- end
185
- end
186
-
187
- it "should return valid content using Rika.parse_content" do
188
- content = Rika.parse_content(@sample_pdf_filespec)
189
- (content.should be_a(String)) && (content.should_not be_empty)
190
- end
191
-
192
- it "should return valid metadata using Rika.parse_metadata" do
193
- metadata = Rika.parse_metadata(@sample_pdf_filespec)
194
- (metadata.should be_a(Hash)) && (metadata.should_not be_empty)
195
- end
196
-
197
- it "should return valid content and metadata using Rika.parse_content_and_metadata" do
198
- content, metadata = Rika.parse_content_and_metadata(@sample_pdf_filespec)
199
- (content.should be_a(String)) && (content.should_not be_empty) && \
200
- (metadata.should be_a(Hash)) && (metadata.should_not be_empty)
201
- end
202
- end
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file