traject 1.1.0 → 2.0.0.rc.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +20 -0
  3. data/README.md +85 -73
  4. data/doc/batch_execution.md +2 -6
  5. data/doc/other_commands.md +3 -5
  6. data/doc/settings.md +27 -38
  7. data/lib/traject/command_line.rb +1 -1
  8. data/lib/traject/csv_writer.rb +34 -0
  9. data/lib/traject/delimited_writer.rb +110 -0
  10. data/lib/traject/indexer.rb +29 -11
  11. data/lib/traject/indexer/settings.rb +39 -13
  12. data/lib/traject/line_writer.rb +10 -6
  13. data/lib/traject/marc_reader.rb +2 -1
  14. data/lib/traject/solr_json_writer.rb +277 -0
  15. data/lib/traject/thread_pool.rb +38 -48
  16. data/lib/traject/translation_map.rb +3 -0
  17. data/lib/traject/util.rb +13 -51
  18. data/lib/traject/version.rb +1 -1
  19. data/lib/translation_maps/marc_geographic.yaml +2 -2
  20. data/test/delimited_writer_test.rb +104 -0
  21. data/test/indexer/read_write_test.rb +0 -22
  22. data/test/indexer/settings_test.rb +24 -0
  23. data/test/solr_json_writer_test.rb +248 -0
  24. data/test/test_helper.rb +5 -3
  25. data/test/test_support/demo_config.rb +0 -5
  26. data/test/translation_map_test.rb +9 -0
  27. data/traject.gemspec +18 -5
  28. metadata +77 -87
  29. data/lib/traject/marc4j_reader.rb +0 -153
  30. data/lib/traject/solrj_writer.rb +0 -351
  31. data/test/marc4j_reader_test.rb +0 -136
  32. data/test/solrj_writer_test.rb +0 -209
  33. data/vendor/solrj/README +0 -8
  34. data/vendor/solrj/build.xml +0 -39
  35. data/vendor/solrj/ivy.xml +0 -16
  36. data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
  37. data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
  38. data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
  39. data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
  40. data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
  41. data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
  42. data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
  43. data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
  44. data/vendor/solrj/lib/noggit-0.5.jar +0 -0
  45. data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
  46. data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
  47. data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
  48. data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
  49. data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
  50. data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
  51. data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
@@ -1,209 +0,0 @@
1
- require 'test_helper'
2
-
3
- require 'traject/solrj_writer'
4
-
5
- # It's crazy hard to test this effectively, especially under threading.
6
- # we do our best to test decently, and keep the tests readable,
7
- # but some things aren't quite reliable under threading, sorry.
8
-
9
- # creates a solrj_writer, maybe with MockSolrServer, maybe
10
- # with a real one. With settings in @settings, set or change
11
- # in before blocks
12
- #
13
- # writer left in @writer, with maybe mock solr server in @mock
14
- def create_solrj_writer
15
- @writer = Traject::SolrJWriter.new(@settings)
16
-
17
- if @settings["solrj_writer.server_class_name"] == "MockSolrServer"
18
- # so we can test it later
19
- @mock = @writer.solr_server
20
- end
21
- end
22
-
23
- def context_with(hash)
24
- Traject::Indexer::Context.new(:output_hash => hash)
25
- end
26
-
27
-
28
- # Some tests we need to run multiple times in multiple batch/thread scenarios,
29
- # we DRY them up by creating a method to add the tests in different describe blocks
30
- def test_handles_errors
31
- it "errors but does not raise on multiple ID's" do
32
- @writer.put context_with("id" => ["one", "two"])
33
- @writer.close
34
- assert_equal 1, @writer.skipped_record_count, "counts skipped record"
35
- end
36
-
37
- it "errors and raises on connection error" do
38
- @settings.merge!("solr.url" => "http://no.such.place")
39
- create_solrj_writer
40
- assert_raises org.apache.solr.client.solrj.SolrServerException do
41
- @writer.put context_with("id" => ["one"])
42
- # in batch and/or thread scenarios, sometimes no exception raised until close
43
- @writer.close
44
- end
45
- end
46
- end
47
-
48
- $stderr.puts "\n======\nWARNING: Testing SolrJWriter with mock instance, set ENV 'solr_url' to test against real solr\n======\n\n" unless ENV["solr_url"]
49
- # WARNING. The SolrJWriter talks to a running Solr server.
50
- #
51
- # set ENV['solr_url'] to run tests against a real solr server
52
- # OR
53
- # the tests will run against a mock SolrJ server instead.
54
- #
55
- #
56
- # This is pretty limited test right now.
57
- describe "Traject::SolrJWriter" do
58
- before do
59
- @settings = {
60
- # Use XMLResponseParser just to test, and so it will work
61
- # with a solr 1.4 test server
62
- "solrj_writer.parser_class_name" => "XMLResponseParser",
63
- "solrj_writer.commit_on_close" => "false", # real solr is way too slow if we always have it commit on close
64
- "solrj_writer.batch_size" => nil
65
- }
66
-
67
- if ENV["solr_url"]
68
- @settings["solr.url"] = ENV["solr_url"]
69
- else
70
- @settings["solr.url"] = "http://example.org/solr"
71
- @settings["solrj_writer.server_class_name"] = "MockSolrServer"
72
- end
73
- end
74
-
75
- it "raises on missing url" do
76
- assert_raises(ArgumentError) { Traject::SolrJWriter.new }
77
- assert_raises(ArgumentError) { Traject::SolrJWriter.new("solr.url" => nil) }
78
- end
79
-
80
- it "raises on malformed URL" do
81
- assert_raises(ArgumentError) { Traject::SolrJWriter.new("solr.url" => "") }
82
- assert_raises(ArgumentError) { Traject::SolrJWriter.new("solr.url" => "adfadf") }
83
- end
84
-
85
- it "defaults to solrj_writer.batch_size more than 1" do
86
- assert 1 < Traject::SolrJWriter.new("solr.url" => "http://example.org/solr").settings["solrj_writer.batch_size"].to_i
87
- end
88
-
89
- describe "with no threading or batching" do
90
- before do
91
- @settings.merge!("solrj_writer.batch_size" => nil, "solrj_writer.thread_pool" => nil)
92
- create_solrj_writer
93
- end
94
-
95
- it "writes a simple document" do
96
- @writer.put context_with("title_t" => ["MY TESTING TITLE"], "id" => ["TEST_TEST_TEST_0001"])
97
- @writer.close
98
-
99
-
100
- if @mock
101
- assert_kind_of org.apache.solr.client.solrj.impl.XMLResponseParser, @mock.parser
102
- assert_equal @settings["solr.url"], @mock.url
103
-
104
- assert_equal 1, @mock.things_added.length
105
- assert_kind_of SolrInputDocument, @mock.things_added.first
106
-
107
- assert @mock.shutted_down
108
- end
109
- end
110
-
111
- it "commits on close when so set" do
112
- @settings.merge!("solrj_writer.commit_on_close" => "true")
113
- create_solrj_writer
114
-
115
- @writer.put context_with("title_t" => ["MY TESTING TITLE"], "id" => ["TEST_TEST_TEST_0001"])
116
- @writer.close
117
-
118
- # if it's not a mock, we don't really test anything, except that
119
- # no exception was raised. oh well. If it's a mock, we can
120
- # ask it.
121
- if @mock
122
- assert @mock.committed, "mock gets commit called on it"
123
- end
124
- end
125
-
126
- test_handles_errors
127
-
128
-
129
- # I got to see what serialized marc binary does against a real solr server,
130
- # sorry this is a bit out of place, but this is the class that talks to real
131
- # solr server right now. This test won't do much unless you have
132
- # real solr server set up.
133
- #
134
- # Not really a good test right now, just manually checking my solr server,
135
- # using this to make the add reproducible at least.
136
- describe "Serialized MARC" do
137
- it "goes to real solr somehow" do
138
- record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
139
-
140
- serialized = record.to_marc # straight binary
141
- @writer.put context_with("marc_record_t" => [serialized], "id" => ["TEST_TEST_TEST_MARC_BINARY"])
142
- @writer.close
143
- end
144
- end
145
- end
146
-
147
- describe "with batching but no threading" do
148
- before do
149
- @settings.merge!("solrj_writer.batch_size" => 5, "solrj_writer.thread_pool" => nil)
150
- create_solrj_writer
151
- end
152
-
153
- it "sends all documents" do
154
- docs = Array(1..17).collect do |i|
155
- {"id" => ["item_#{i}"], "title" => ["To be #{i} again!"]}
156
- end
157
-
158
- docs.each do |doc|
159
- @writer.put context_with(doc)
160
- end
161
- @writer.close
162
-
163
- if @mock
164
- # 3 batches of 5, and the leftover 2 (16, 17)
165
- assert_length 4, @mock.things_added
166
-
167
- assert_length 5, @mock.things_added[0]
168
- assert_length 5, @mock.things_added[1]
169
- assert_length 5, @mock.things_added[2]
170
- assert_length 2, @mock.things_added[3]
171
- end
172
- end
173
-
174
- test_handles_errors
175
- end
176
-
177
- describe "with batching and threading" do
178
- before do
179
- @settings.merge!("solrj_writer.batch_size" => 5, "solrj_writer.thread_pool" => 2)
180
- create_solrj_writer
181
- end
182
-
183
- it "sends all documents" do
184
- docs = Array(1..17).collect do |i|
185
- {"id" => ["item_#{i}"], "title" => ["To be #{i} again!"]}
186
- end
187
-
188
- docs.each do |doc|
189
- @writer.put context_with(doc)
190
- end
191
- @writer.close
192
-
193
- if @mock
194
- # 3 batches of 5, and the leftover 2 (16, 17)
195
- assert_length 4, @mock.things_added
196
-
197
- # we can't be sure of the order under async,
198
- # just three of 5 and one of 2
199
- assert_length 3, @mock.things_added.find_all {|array| array.length == 5}
200
- assert_length 1, @mock.things_added.find_all {|array| array.length == 2}
201
- end
202
- end
203
-
204
- test_handles_errors
205
- end
206
-
207
- end
208
-
209
- require 'thread' # Mutex
@@ -1,8 +0,0 @@
1
- Inside ./lib are all the jar files necessary for solrj. They are used by the SolrJWriter.
2
-
3
- The build.xml and ivy.xml file included here were used to download the jars, and
4
- can be used to re-download them. Just run `ant` in this directory, and the contents of `./lib` will be replaced by the current latest release of solrj. Or edit ivy.xml to download a specific solrj version (perhaps change ivy.xml to use a java prop for release, defaulting to latest! ha.) And then commit changes to repo, etc, to update solrj distro'd with traject.
5
-
6
- This is not necessarily a great way to provide access to solrj .jars. It's just what we're doing now, and it works. See main project README.md for discussion and other potential ideas.
7
-
8
- Note, the ivy.xml in here currently downloads a bit MORE than we really need, like .jars of docs and source. Haven't yet figured out how to tell it to download all maven-specified solrj jars that we really need, but not the ones we don't need. (we DO need logging-related ones to properly get logging working!) If you can figure it out, it'd be an improvement, as ALL jars in this dir are by default loaded by traject at runtime.
@@ -1,39 +0,0 @@
1
- <?xml version="1.0" encoding="utf-8"?>
2
- <project xmlns:ivy="antlib:org.apache.ivy.ant" name="traject-fetch-jars" default="prepare" basedir=".">
3
-
4
-
5
-
6
-
7
-
8
- <target name="prepare" depends="setup-ivy">
9
- <mkdir dir="lib"/>
10
- <ivy:retrieve sync="true"/>
11
- </target>
12
-
13
- <target name="clean">
14
- <delete dir="lib"/>
15
- </target>
16
-
17
-
18
-
19
- <property name="ivy.install.version" value="2.3.0"/>
20
- <property name="ivy.jar.dir" value="ivy"/>
21
- <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar"/>
22
-
23
- <available file="${ivy.jar.file}" property="skip.download"/>
24
-
25
- <target name="download-ivy" unless="skip.download">
26
- <mkdir dir="${ivy.jar.dir}"/>
27
-
28
- <echo message="installing ivy..."/>
29
- <get src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" dest="${ivy.jar.file}" usetimestamp="true"/>
30
- </target>
31
-
32
- <target name="setup-ivy" depends="download-ivy" description="--> setup ivy">
33
- <path id="ivy.lib.path">
34
- <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
35
- </path>
36
- <taskdef resource="org/apache/ivy/ant/antlib.xml" uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
37
- </target>
38
-
39
- </project>
@@ -1,16 +0,0 @@
1
- <ivy-module version="2.0">
2
- <info organisation="org.code4lib" module="traject"/>
3
-
4
- <dependencies>
5
- <!-- downloads EVERYTHING including docs and source we don't need. Oh well, it
6
- works for prototyping at least... -->
7
- <dependency org="org.apache.solr" name="solr-solrj" rev="latest.release"/>
8
-
9
-
10
- <!-- Attempts to give us just what we need, including working logging, still
11
- not quite right, but leaving here for thinking... -->
12
- <!-- <dependency org="org.apache.solr" name="solr-solrj" rev="latest.release" conf="default" />
13
- <dependency org="org.slf4j" name="slf4j-simple" rev="latest.release"/> -->
14
- </dependencies>
15
- </ivy-module>
16
-