traject 1.0.0.beta.1 → 1.0.0.beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/traject/indexer.rb +4 -4
- data/lib/traject/macros/marc21.rb +11 -1
- data/lib/traject/macros/marc21_semantics.rb +69 -1
- data/lib/traject/version.rb +1 -1
- data/test/indexer/macros_marc21_semantics_test.rb +47 -1
- data/test/indexer/macros_marc21_test.rb +3 -0
- data/test/test_support/demo_config.rb +1 -1
- data/test/test_support/george_eliot.marc +1 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cb35b4c5ba302cb865b459bfac6859ef6be68927
|
4
|
+
data.tar.gz: 14964b88428d0a827932cbf17194a77c56de1091
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3ae114fe4a11baaf6f6470d35d5df339a32b8f7e747f012310ee90ed55b982754d196abcc25ec0375d8bf988ca5e3ebd2c71ef7df159bc7007e6cdae9c69643
|
7
|
+
data.tar.gz: 74c6c6170860cf1cd4883d644f9c23151a703ff996592abb8668a655dcff885c108c7e04e13419f6c4ba727384b3bab3dff896c177382284b8d12b98cc711225
|
data/README.md
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
Tools for reading MARC records, transforming them with indexing rules, and indexing to Solr.
|
4
4
|
Might be used to index MARC data for a Solr-based discovery product like [Blacklight](https://github.com/projectblacklight/blacklight) or [VUFind](http://vufind.org/).
|
5
5
|
|
6
|
-
Traject might also be generalized to a set of tools for getting structured data from a source, and
|
6
|
+
Traject might also be generalized to a set of tools for getting structured data from a source, and transforming it to a hash-like object to send to a destination.
|
7
7
|
|
8
8
|
|
9
9
|
**Traject is nearing 1.0; it is robust, feature-rich, and being used in production by its authors -- feedback invited.**
|
data/lib/traject/indexer.rb
CHANGED
@@ -7,7 +7,6 @@ require 'traject/indexer/settings'
|
|
7
7
|
require 'traject/marc_reader'
|
8
8
|
require 'traject/marc4j_reader'
|
9
9
|
require 'traject/json_writer'
|
10
|
-
require 'traject/solrj_writer'
|
11
10
|
|
12
11
|
require 'traject/macros/marc21'
|
13
12
|
require 'traject/macros/basic'
|
@@ -71,9 +70,10 @@ require 'traject/macros/basic'
|
|
71
70
|
# 4) Optionally implements a #skipped_record_count method, returning int count of records
|
72
71
|
# that were skipped due to errors (and presumably logged)
|
73
72
|
#
|
74
|
-
# The default writer
|
75
|
-
#
|
76
|
-
#
|
73
|
+
# The default writer is the SolrJWriter, using Java SolrJ to
|
74
|
+
# write to Solr. A few other built-in writers are available,
|
75
|
+
# but it's anticipated more will be created as plugins or local
|
76
|
+
# code for special purposes.
|
77
77
|
#
|
78
78
|
# You can set alternate writers by setting a Class object directly
|
79
79
|
# with the #writer_class method, or by the 'writer_class_name' Setting,
|
@@ -191,8 +191,18 @@ module Traject::Macros
|
|
191
191
|
#
|
192
192
|
# Returns altered string, doesn't change original arg.
|
193
193
|
# Strips common trailing and enclosing punctuation from a string.
#
# Applied in order:
#   1. one trailing comma, slash, semicolon, colon or space, together with
#      any spaces around it;
#   2. a trailing period (and any spaces after it), but only when the three
#      characters before the period are word characters, so initials such
#      as "J." keep theirs;
#   3. a single leading "[" and/or trailing "]" when the string contains no
#      other square brackets.
#
# str - the String to clean up; nil is passed straight through.
#
# Returns a new String (the argument is not mutated), or nil if given nil.
def self.trim_punctuation(str)
  # If something went wrong and we got a nil, just return it
  return str unless str

  # trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
  str = str.sub(/ *[ ,\/;:] *\Z/, '')

  # trailing period if it is preceded by at least three word characters
  # (possibly followed by whitespace). [[:word:]] is used rather than \w
  # because \w only matches ASCII in Ruby, which left the period on values
  # ending in an accented or non-Latin letter.
  str = str.sub(/( *[[:word:]]{3})\. *\Z/, '\1')

  # single square bracket characters if they are the start and/or end
  # chars and there are no internal square brackets.
  str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# Encoding: UTF-8
|
2
|
+
|
1
3
|
require 'traject/marc_extractor'
|
2
4
|
|
3
5
|
module Traject::Macros
|
@@ -81,7 +83,8 @@ module Traject::Macros
|
|
81
83
|
# Macro producing the sortable title: 245 a and b, with non-filing
# characters stripped off.
#
# Returns a lambda taking (record, accumulator). It asks
# Marc21Semantics.get_sortable_title for the record's title and appends the
# result to the accumulator; a nil/false result adds nothing.
def marc_sortable_title
  lambda do |record, accumulator|
    sortable_title = Marc21Semantics.get_sortable_title(record)
    next unless sortable_title

    accumulator << sortable_title
  end
end
|
87
90
|
|
@@ -503,6 +506,71 @@ module Traject::Macros
|
|
503
506
|
end
|
504
507
|
end
|
505
508
|
|
509
|
+
# Extracts LCSH-carrying fields and formats each one as a pre-coordinated
# LCSH string, for instance suitable for including in a facet.
#
# You can supply your own list of fields as a spec, but for significant
# customization you probably just want to write your own method in
# terms of the Marc21Semantics.assemble_lcsh method.
#
# options - Hash of optional settings:
#   :spec                  - spec string handed to MarcExtractor.new naming
#                            the fields to read. Defaults to the subject
#                            fields 600, 610, 611, 630, 648, 650, 651, 654
#                            and 662.
#   :subdivision_separator - String passed to assemble_lcsh as the
#                            subdivision separator (default " — "). The
#                            historical misspelling :subdivison_separator
#                            is still honored so existing configs keep
#                            working.
#   :other_separator       - String passed to assemble_lcsh as the separator
#                            for all other subfields (default " ").
#
# Returns a lambda taking (record, accumulator) that appends one formatted
# string per matching field to the accumulator.
def marc_lcsh_formatted(options = {})
  # The default previously ended in "6662" (tag 666, subfield 2), which
  # looks like a typo for 662; MARC 21 has no 666 field.
  spec = options[:spec] || "600:610:611:630:648:650:651:654:662"
  subd_separator = options[:subdivision_separator] ||
                   options[:subdivison_separator] ||
                   " — "
  other_separator = options[:other_separator] || " "

  extractor = MarcExtractor.new(spec)

  lambda do |record, accumulator|
    # The second yielded value is unused; underscore-prefixed so it no longer
    # shadows the outer `spec` local.
    headings = extractor.collect_matching_lines(record) do |field, _field_spec|
      Marc21Semantics.assemble_lcsh(field, subd_separator, other_separator)
    end
    accumulator.concat(headings)
  end
end
|
530
|
+
|
531
|
+
# Takes a MARC::Field and formats it into a pre-coordinated LCSH string
# with subdivision separators in the right place.
#
# For 600 fields especially, need to not just join with subdivision separator
# to take account of $a$d$t -- for other fields, might be able to just
# join subfields, not sure.
#
# WILL strip trailing period from generated string, contrary to some LCSH practice.
# Our data is inconsistent on whether it has period or not, this was
# the easiest way to standardize.
#
# Default subdivision separator is em-dash with spaces, set to '--' if you want.
#
# Cite: "Dash (-) that precedes a subdivision in an extended 600 subject heading
# is not carried in the MARC record. It may be system generated as a display constant
# associated with the content of subfield $v, $x, $y, and $z."
# http://www.loc.gov/marc/bibliographic/bd600.html
#
# marc_field      - object responding to #subfields, each subfield
#                   responding to #code and #value.
# subd_separator  - String inserted before $v, $x, $y and $z subfields.
# other_separator - String inserted before every other alphabetic subfield.
#
# Returns the assembled String, or nil when no alphabetic subfield
# contributed any text.
def self.assemble_lcsh(marc_field, subd_separator = " — ", other_separator = " ")
  subd_prefix_codes = %w{v x y z}
  str = ""

  marc_field.subfields.each do |sf|
    # ignore non-alphabetic, like numeric control subfields
    next unless sf.code =~ /\A[a-z]\z/

    # Separators only go *between* emitted subfields. Checking the output
    # rather than the raw subfield index avoids a stray leading separator
    # when a skipped control subfield ($6, $3, ...) comes first, or when the
    # first emitted subfield is itself a subdivision.
    unless str.empty?
      str << (subd_prefix_codes.include?(sf.code) ? subd_separator : other_separator)
    end

    # to_s so a subfield with a nil value is skipped over instead of raising.
    str << sf.value.to_s
  end

  # Only one match is possible at the end anchor, so sub! is enough.
  str.sub!(/\.\Z/, '')

  str.empty? ? nil : str
end
|
573
|
+
|
506
574
|
|
507
575
|
end
|
508
576
|
end
|
data/lib/traject/version.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# Encoding: UTF-8
|
2
|
+
|
1
3
|
require 'test_helper'
|
2
4
|
|
3
5
|
require 'traject/indexer'
|
@@ -231,7 +233,52 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
231
233
|
assert_equal ["Early modern, 1500-1700", "17th century", "Great Britain: Puritan Revolution, 1642-1660", "Great Britain: Civil War, 1642-1649", "1642-1660"],
|
232
234
|
output["era_facet"]
|
233
235
|
end
|
236
|
+
end
|
237
|
+
|
238
|
+
# Specs for Marc21Semantics.assemble_lcsh and, nested below, a smoke test
# for the marc_lcsh_formatted macro run through the indexer.
describe "marc_lcsh_display" do
  it "formats typical field" do
    subject_field = MARC::DataField.new(
      '650', ' ', ' ',
      ['a', 'Psychoanalysis and literature'],
      ['z', 'England'],
      ['x', 'History'],
      ['y', '19th century.']
    )

    assert_equal "Psychoanalysis and literature — England — History — 19th century",
                 Marc21Semantics.assemble_lcsh(subject_field)
  end

  it "ignores numeric subfields" do
    subject_field = MARC::DataField.new(
      '650', ' ', ' ',
      ['a', 'Psychoanalysis and literature'],
      ['x', 'History'],
      ['0', '01234'],
      ['3', 'Some part']
    )

    assert_equal "Psychoanalysis and literature — History",
                 Marc21Semantics.assemble_lcsh(subject_field)
  end

  it "doesn't put subdivision in wrong place" do
    subject_field = MARC::DataField.new(
      '600', ' ', ' ',
      ['a', 'Eliot, George,'],
      ['d', '1819-1880.'],
      ['t', 'Middlemarch']
    )

    assert_equal "Eliot, George, 1819-1880. Middlemarch",
                 Marc21Semantics.assemble_lcsh(subject_field)
  end

  it "mixes non-subdivisions with subdivisions" do
    subject_field = MARC::DataField.new(
      '600', ' ', ' ',
      ['a', 'Eliot, George,'],
      ['d', '1819-1880.'],
      ['t', 'Middlemarch'],
      ['x', 'Criticism.']
    )

    assert_equal "Eliot, George, 1819-1880. Middlemarch — Criticism",
                 Marc21Semantics.assemble_lcsh(subject_field)
  end

  it "returns nil for a field with no relevant subfields" do
    empty_field = MARC::DataField.new('650', ' ', ' ')

    assert_nil Marc21Semantics.assemble_lcsh(empty_field)
  end

  describe "marc_lcsh_formatted macro" do
    it "smoke test" do
      # Index the first record of the fixture file using the macro's defaults.
      @record = MARC::Reader.new(support_file_path("george_eliot.marc")).to_a.first
      @indexer.instance_eval { to_field("lcsh", marc_lcsh_formatted) }
      lcsh_values = @indexer.map_record(@record)["lcsh"]

      assert lcsh_values.length > 0, "outputs data"
      assert lcsh_values.include?("Eliot, George, 1819-1880 — Characters"), "includes a string its supposed to"
    end
  end
end
|
236
283
|
|
237
284
|
describe "extract_marc_filing_version" do
|
@@ -272,7 +319,6 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
272
319
|
end
|
273
320
|
end
|
274
321
|
|
275
|
-
|
276
322
|
end
|
277
323
|
|
278
324
|
|
@@ -97,6 +97,9 @@ describe "Traject::Macros::Marc21" do
|
|
97
97
|
assert_equal "one two three", Marc21.trim_punctuation("one two three]")
|
98
98
|
assert_equal "one two three", Marc21.trim_punctuation("[one two three")
|
99
99
|
assert_equal "one two three", Marc21.trim_punctuation("[one two three]")
|
100
|
+
|
101
|
+
# This one was a bug before
|
102
|
+
assert_equal "Feminism and art", Marc21.trim_punctuation("Feminism and art.")
|
100
103
|
end
|
101
104
|
|
102
105
|
it "uses :translation_map" do
|
@@ -20,7 +20,7 @@ extend Traject::Macros::MarcFormats
|
|
20
20
|
# files however you like, you can call traject with as many
|
21
21
|
# config files as you like, `traject -c one.rb -c two.rb -c etc.rb`
|
22
22
|
settings do
|
23
|
-
provide "solr.url", "http://
|
23
|
+
provide "solr.url", "http://solr.somewhere.edu:8983/solr/corename"
|
24
24
|
|
25
25
|
# Only if you need to connect to a Solr 1.x:
|
26
26
|
provide "solrj_writer.parser_class_name", "XMLResponseParser"
|
@@ -0,0 +1 @@
|
|
1
|
+
01359cam a2200361 a 4500001000800000005001700008008004100025010001700066020002800083020003500111035001600146040001300162043001200175049000900187050002500196082001500221100002200236245009700258260005800355300002700413440004600440504006400486600005400550600004700604600004400651600004300695650006700738650005800805650002900863910002600892994001200918991006700930232964520030805093128.0020925s2003 nyu b s001 0 eng a 2002036483 a0791458334 (alk. paper) a0791458342 (pbk. : alk. paper) aocm50737282 aDLCcDLC ae-uk-en aJHEE00aPR4692.P74bP37 200300a823/.82211 aParis, Bernard J.10aRereading George Eliot :bchanging responses to her experiments in life /cBernard J. Paris. aAlbany :bState University of New York Press,cc2003. axiii, 220 p. ;c23 cm. 0aSUNY series in psychoanalysis and culture aIncludes bibliographical references (p. 213-215) and index.10aEliot, George,d1819-1880xKnowledgexPsychology.10aEliot, George,d1819-1880.tDaniel Deronda10aEliot, George,d1819-1880.tMiddlemarch10aEliot, George,d1819-1880xCharacters. 0aPsychoanalysis and literaturezEnglandxHistoryy19th century. 0aPsychological fiction, EnglishxHistory and criticism 0aPsychology in literature a2329645bHorizon bib# aE0bJHE aPR4692.P74 P37 2003flcbelc1cc. 1q0i3857076lembluememsel
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0.beta.
|
4
|
+
version: 1.0.0.beta.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-10-
|
12
|
+
date: 2013-10-17 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: marc
|
@@ -216,6 +216,7 @@ files:
|
|
216
216
|
- test/test_support/date_with_u.marc
|
217
217
|
- test/test_support/demo_config.rb
|
218
218
|
- test/test_support/emptyish_record.marc
|
219
|
+
- test/test_support/george_eliot.marc
|
219
220
|
- test/test_support/hebrew880s.marc
|
220
221
|
- test/test_support/louis_armstrong.marc
|
221
222
|
- test/test_support/manufacturing_consent.marc
|
@@ -313,6 +314,7 @@ test_files:
|
|
313
314
|
- test/test_support/date_with_u.marc
|
314
315
|
- test/test_support/demo_config.rb
|
315
316
|
- test/test_support/emptyish_record.marc
|
317
|
+
- test/test_support/george_eliot.marc
|
316
318
|
- test/test_support/hebrew880s.marc
|
317
319
|
- test/test_support/louis_armstrong.marc
|
318
320
|
- test/test_support/manufacturing_consent.marc
|