traject 1.1.0 → 2.0.0.rc.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +20 -0
- data/README.md +85 -73
- data/doc/batch_execution.md +2 -6
- data/doc/other_commands.md +3 -5
- data/doc/settings.md +27 -38
- data/lib/traject/command_line.rb +1 -1
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +29 -11
- data/lib/traject/indexer/settings.rb +39 -13
- data/lib/traject/line_writer.rb +10 -6
- data/lib/traject/marc_reader.rb +2 -1
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +38 -48
- data/lib/traject/translation_map.rb +3 -0
- data/lib/traject/util.rb +13 -51
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/marc_geographic.yaml +2 -2
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/read_write_test.rb +0 -22
- data/test/indexer/settings_test.rb +24 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +5 -3
- data/test/test_support/demo_config.rb +0 -5
- data/test/translation_map_test.rb +9 -0
- data/traject.gemspec +18 -5
- metadata +77 -87
- data/lib/traject/marc4j_reader.rb +0 -153
- data/lib/traject/solrj_writer.rb +0 -351
- data/test/marc4j_reader_test.rb +0 -136
- data/test/solrj_writer_test.rb +0 -209
- data/vendor/solrj/README +0 -8
- data/vendor/solrj/build.xml +0 -39
- data/vendor/solrj/ivy.xml +0 -16
- data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
- data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
- data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
- data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
- data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
- data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
- data/vendor/solrj/lib/noggit-0.5.jar +0 -0
- data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
- data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
- data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
- data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
data/test/solrj_writer_test.rb
DELETED
@@ -1,209 +0,0 @@
|
|
1
|
-
require 'test_helper'
|
2
|
-
|
3
|
-
require 'traject/solrj_writer'
|
4
|
-
|
5
|
-
# It's crazy hard to test this effectively, especially under threading.
|
6
|
-
# we do our best to test decently, and keep the tests readable,
|
7
|
-
# but some things aren't quite reliable under threading, sorry.
|
8
|
-
|
9
|
-
# creates a solrj_writer, maybe with MockSolrServer, maybe
|
10
|
-
# with a real one. With settings in @settings, set or change
|
11
|
-
# in before blocks
|
12
|
-
#
|
13
|
-
# writer left in @writer, with maybe mock solr server in @mock
|
14
|
-
def create_solrj_writer
|
15
|
-
@writer = Traject::SolrJWriter.new(@settings)
|
16
|
-
|
17
|
-
if @settings["solrj_writer.server_class_name"] == "MockSolrServer"
|
18
|
-
# so we can test it later
|
19
|
-
@mock = @writer.solr_server
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def context_with(hash)
|
24
|
-
Traject::Indexer::Context.new(:output_hash => hash)
|
25
|
-
end
|
26
|
-
|
27
|
-
|
28
|
-
# Some tests we need to run multiple times in multiple batch/thread scenarios,
|
29
|
-
# we DRY them up by creating a method to add the tests in different describe blocks
|
30
|
-
def test_handles_errors
|
31
|
-
it "errors but does not raise on multiple ID's" do
|
32
|
-
@writer.put context_with("id" => ["one", "two"])
|
33
|
-
@writer.close
|
34
|
-
assert_equal 1, @writer.skipped_record_count, "counts skipped record"
|
35
|
-
end
|
36
|
-
|
37
|
-
it "errors and raises on connection error" do
|
38
|
-
@settings.merge!("solr.url" => "http://no.such.place")
|
39
|
-
create_solrj_writer
|
40
|
-
assert_raises org.apache.solr.client.solrj.SolrServerException do
|
41
|
-
@writer.put context_with("id" => ["one"])
|
42
|
-
# in batch and/or thread scenarios, sometimes no exception raised until close
|
43
|
-
@writer.close
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
$stderr.puts "\n======\nWARNING: Testing SolrJWriter with mock instance, set ENV 'solr_url' to test against real solr\n======\n\n" unless ENV["solr_url"]
|
49
|
-
# WARNING. The SolrJWriter talks to a running Solr server.
|
50
|
-
#
|
51
|
-
# set ENV['solr_url'] to run tests against a real solr server
|
52
|
-
# OR
|
53
|
-
# the tests will run against a mock SolrJ server instead.
|
54
|
-
#
|
55
|
-
#
|
56
|
-
# This is a pretty limited test right now.
|
57
|
-
describe "Traject::SolrJWriter" do
|
58
|
-
before do
|
59
|
-
@settings = {
|
60
|
-
# Use XMLResponseParser just to test, and so it will work
|
61
|
-
# with a solr 1.4 test server
|
62
|
-
"solrj_writer.parser_class_name" => "XMLResponseParser",
|
63
|
-
"solrj_writer.commit_on_close" => "false", # real solr is way too slow if we always have it commit on close
|
64
|
-
"solrj_writer.batch_size" => nil
|
65
|
-
}
|
66
|
-
|
67
|
-
if ENV["solr_url"]
|
68
|
-
@settings["solr.url"] = ENV["solr_url"]
|
69
|
-
else
|
70
|
-
@settings["solr.url"] = "http://example.org/solr"
|
71
|
-
@settings["solrj_writer.server_class_name"] = "MockSolrServer"
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
it "raises on missing url" do
|
76
|
-
assert_raises(ArgumentError) { Traject::SolrJWriter.new }
|
77
|
-
assert_raises(ArgumentError) { Traject::SolrJWriter.new("solr.url" => nil) }
|
78
|
-
end
|
79
|
-
|
80
|
-
it "raises on malformed URL" do
|
81
|
-
assert_raises(ArgumentError) { Traject::SolrJWriter.new("solr.url" => "") }
|
82
|
-
assert_raises(ArgumentError) { Traject::SolrJWriter.new("solr.url" => "adfadf") }
|
83
|
-
end
|
84
|
-
|
85
|
-
it "defaults to solrj_writer.batch_size more than 1" do
|
86
|
-
assert 1 < Traject::SolrJWriter.new("solr.url" => "http://example.org/solr").settings["solrj_writer.batch_size"].to_i
|
87
|
-
end
|
88
|
-
|
89
|
-
describe "with no threading or batching" do
|
90
|
-
before do
|
91
|
-
@settings.merge!("solrj_writer.batch_size" => nil, "solrj_writer.thread_pool" => nil)
|
92
|
-
create_solrj_writer
|
93
|
-
end
|
94
|
-
|
95
|
-
it "writes a simple document" do
|
96
|
-
@writer.put context_with("title_t" => ["MY TESTING TITLE"], "id" => ["TEST_TEST_TEST_0001"])
|
97
|
-
@writer.close
|
98
|
-
|
99
|
-
|
100
|
-
if @mock
|
101
|
-
assert_kind_of org.apache.solr.client.solrj.impl.XMLResponseParser, @mock.parser
|
102
|
-
assert_equal @settings["solr.url"], @mock.url
|
103
|
-
|
104
|
-
assert_equal 1, @mock.things_added.length
|
105
|
-
assert_kind_of SolrInputDocument, @mock.things_added.first
|
106
|
-
|
107
|
-
assert @mock.shutted_down
|
108
|
-
end
|
109
|
-
end
|
110
|
-
|
111
|
-
it "commits on close when so set" do
|
112
|
-
@settings.merge!("solrj_writer.commit_on_close" => "true")
|
113
|
-
create_solrj_writer
|
114
|
-
|
115
|
-
@writer.put context_with("title_t" => ["MY TESTING TITLE"], "id" => ["TEST_TEST_TEST_0001"])
|
116
|
-
@writer.close
|
117
|
-
|
118
|
-
# if it's not a mock, we don't really test anything, except that
|
119
|
-
# no exception was raised. oh well. If it's a mock, we can
|
120
|
-
# ask it.
|
121
|
-
if @mock
|
122
|
-
assert @mock.committed, "mock gets commit called on it"
|
123
|
-
end
|
124
|
-
end
|
125
|
-
|
126
|
-
test_handles_errors
|
127
|
-
|
128
|
-
|
129
|
-
# I got to see what serialized marc binary does against a real solr server,
|
130
|
-
# sorry this is a bit out of place, but this is the class that talks to real
|
131
|
-
# solr server right now. This test won't do much unless you have
|
132
|
-
# real solr server set up.
|
133
|
-
#
|
134
|
-
# Not really a good test right now, just manually checking my solr server,
|
135
|
-
# using this to make the add reproducible at least.
|
136
|
-
describe "Serialized MARC" do
|
137
|
-
it "goes to real solr somehow" do
|
138
|
-
record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
139
|
-
|
140
|
-
serialized = record.to_marc # straight binary
|
141
|
-
@writer.put context_with("marc_record_t" => [serialized], "id" => ["TEST_TEST_TEST_MARC_BINARY"])
|
142
|
-
@writer.close
|
143
|
-
end
|
144
|
-
end
|
145
|
-
end
|
146
|
-
|
147
|
-
describe "with batching but no threading" do
|
148
|
-
before do
|
149
|
-
@settings.merge!("solrj_writer.batch_size" => 5, "solrj_writer.thread_pool" => nil)
|
150
|
-
create_solrj_writer
|
151
|
-
end
|
152
|
-
|
153
|
-
it "sends all documents" do
|
154
|
-
docs = Array(1..17).collect do |i|
|
155
|
-
{"id" => ["item_#{i}"], "title" => ["To be #{i} again!"]}
|
156
|
-
end
|
157
|
-
|
158
|
-
docs.each do |doc|
|
159
|
-
@writer.put context_with(doc)
|
160
|
-
end
|
161
|
-
@writer.close
|
162
|
-
|
163
|
-
if @mock
|
164
|
-
# 3 batches of 5, and the leftover 2 (16, 17)
|
165
|
-
assert_length 4, @mock.things_added
|
166
|
-
|
167
|
-
assert_length 5, @mock.things_added[0]
|
168
|
-
assert_length 5, @mock.things_added[1]
|
169
|
-
assert_length 5, @mock.things_added[2]
|
170
|
-
assert_length 2, @mock.things_added[3]
|
171
|
-
end
|
172
|
-
end
|
173
|
-
|
174
|
-
test_handles_errors
|
175
|
-
end
|
176
|
-
|
177
|
-
describe "with batching and threading" do
|
178
|
-
before do
|
179
|
-
@settings.merge!("solrj_writer.batch_size" => 5, "solrj_writer.thread_pool" => 2)
|
180
|
-
create_solrj_writer
|
181
|
-
end
|
182
|
-
|
183
|
-
it "sends all documents" do
|
184
|
-
docs = Array(1..17).collect do |i|
|
185
|
-
{"id" => ["item_#{i}"], "title" => ["To be #{i} again!"]}
|
186
|
-
end
|
187
|
-
|
188
|
-
docs.each do |doc|
|
189
|
-
@writer.put context_with(doc)
|
190
|
-
end
|
191
|
-
@writer.close
|
192
|
-
|
193
|
-
if @mock
|
194
|
-
# 3 batches of 5, and the leftover 2 (16, 17)
|
195
|
-
assert_length 4, @mock.things_added
|
196
|
-
|
197
|
-
# we can't be sure of the order under async,
|
198
|
-
# just three of 5 and one of 2
|
199
|
-
assert_length 3, @mock.things_added.find_all {|array| array.length == 5}
|
200
|
-
assert_length 1, @mock.things_added.find_all {|array| array.length == 2}
|
201
|
-
end
|
202
|
-
end
|
203
|
-
|
204
|
-
test_handles_errors
|
205
|
-
end
|
206
|
-
|
207
|
-
end
|
208
|
-
|
209
|
-
require 'thread' # Mutex
|
data/vendor/solrj/README
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
Inside ./lib are all the jar files necessary for solrj. They are used by the SolrJWriter.
|
2
|
-
|
3
|
-
The build.xml and ivy.xml file included here were used to download the jars, and
|
4
|
-
can be used to re-download them. Just run `ant` in this directory, and the contents of `./lib` will be replaced by the current latest release of solrj. Or edit ivy.xml to download a specific solrj version (perhaps change ivy.xml to use a java prop for release, defaulting to latest! ha.) And then commit changes to repo, etc, to update solrj distro'd with traject.
|
5
|
-
|
6
|
-
This is not necessarily a great way to provide access to solrj .jars. It's just what we're doing now, and it works. See main project README.md for discussion and other potential ideas.
|
7
|
-
|
8
|
-
Note, the ivy.xml in here currently downloads a bit MORE than we really need, like .jars of docs and source. Haven't yet figured out how to tell it to download all maven-specified solrj jars that we really need, but not the ones we don't need. (we DO need logging-related ones to properly get logging working!) If you can figure it out, it'd be an improvement, as ALL jars in this dir are by default loaded by traject at runtime.
|
data/vendor/solrj/build.xml
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="utf-8"?>
|
2
|
-
<project xmlns:ivy="antlib:org.apache.ivy.ant" name="traject-fetch-jars" default="prepare" basedir=".">
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
<target name="prepare" depends="setup-ivy">
|
9
|
-
<mkdir dir="lib"/>
|
10
|
-
<ivy:retrieve sync="true"/>
|
11
|
-
</target>
|
12
|
-
|
13
|
-
<target name="clean">
|
14
|
-
<delete dir="lib"/>
|
15
|
-
</target>
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
<property name="ivy.install.version" value="2.3.0"/>
|
20
|
-
<property name="ivy.jar.dir" value="ivy"/>
|
21
|
-
<property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar"/>
|
22
|
-
|
23
|
-
<available file="${ivy.jar.file}" property="skip.download"/>
|
24
|
-
|
25
|
-
<target name="download-ivy" unless="skip.download">
|
26
|
-
<mkdir dir="${ivy.jar.dir}"/>
|
27
|
-
|
28
|
-
<echo message="installing ivy..."/>
|
29
|
-
<get src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" dest="${ivy.jar.file}" usetimestamp="true"/>
|
30
|
-
</target>
|
31
|
-
|
32
|
-
<target name="setup-ivy" depends="download-ivy" description="--> setup ivy">
|
33
|
-
<path id="ivy.lib.path">
|
34
|
-
<fileset dir="${ivy.jar.dir}" includes="*.jar"/>
|
35
|
-
</path>
|
36
|
-
<taskdef resource="org/apache/ivy/ant/antlib.xml" uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
|
37
|
-
</target>
|
38
|
-
|
39
|
-
</project>
|
data/vendor/solrj/ivy.xml
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
<ivy-module version="2.0">
|
2
|
-
<info organisation="org.code4lib" module="traject"/>
|
3
|
-
|
4
|
-
<dependencies>
|
5
|
-
<!-- downloads EVERYTHING including docs and source we don't need. Oh well, it
|
6
|
-
works for prototyping at least... -->
|
7
|
-
<dependency org="org.apache.solr" name="solr-solrj" rev="latest.release"/>
|
8
|
-
|
9
|
-
|
10
|
-
<!-- Attempts to give us just what we need, including working logging, still
|
11
|
-
not quite right, but leaving here for thinking... -->
|
12
|
-
<!-- <dependency org="org.apache.solr" name="solr-solrj" rev="latest.release" conf="default" />
|
13
|
-
<dependency org="org.slf4j" name="slf4j-simple" rev="latest.release"/> -->
|
14
|
-
</dependencies>
|
15
|
-
</ivy-module>
|
16
|
-
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|