traject 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +346 -0
- data/Rakefile +16 -0
- data/bin/traject +153 -0
- data/doc/macros.md +103 -0
- data/doc/settings.md +34 -0
- data/lib/traject.rb +10 -0
- data/lib/traject/indexer.rb +196 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +145 -0
- data/lib/traject/marc_extractor.rb +206 -0
- data/lib/traject/marc_reader.rb +61 -0
- data/lib/traject/qualified_const_get.rb +30 -0
- data/lib/traject/solrj_writer.rb +120 -0
- data/lib/traject/translation_map.rb +184 -0
- data/lib/traject/version.rb +3 -0
- data/test/indexer/macros_marc21_test.rb +146 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +120 -0
- data/test/indexer/read_write_test.rb +47 -0
- data/test/indexer/settings_test.rb +65 -0
- data/test/marc_extractor_test.rb +168 -0
- data/test/marc_reader_test.rb +29 -0
- data/test/solrj_writer_test.rb +106 -0
- data/test/test_helper.rb +28 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/translation_map_test.rb +98 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +30 -0
- data/vendor/solrj/README +8 -0
- data/vendor/solrj/build.xml +39 -0
- data/vendor/solrj/ivy.xml +16 -0
- data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
- data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
- data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
- data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
- data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
- data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
- data/vendor/solrj/lib/noggit-0.5.jar +0 -0
- data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
- data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
- data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
- data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
- metadata +264 -0
@@ -0,0 +1,206 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Traject
|
4
|
+
# MarcExtractor is a class for extracting lists of strings from a MARC::Record,
|
5
|
+
# according to specifications. See #parse_string_spec for description of string
|
6
|
+
# arguments used to specify extraction. See #initialize for options
|
7
|
+
# that can be set controlling extraction.
|
8
|
+
#
|
9
|
+
# Examples:
|
10
|
+
#
|
11
|
+
# array_of_stuff = MarcExtractor.new(marc_record, "001:245abc:700a").extract
|
12
|
+
# values = MarcExtractor.new(marc_record, "040a", :seperator => nil).extract
|
13
|
+
#
|
14
|
+
class MarcExtractor
|
15
|
+
attr_accessor :options, :marc_record, :spec_hash
|
16
|
+
|
17
|
+
|
18
|
+
# Convenience method to construct a MarcExtractor object and
|
19
|
+
# run extract on it.
|
20
|
+
#
|
21
|
+
# First arg is a marc record.
|
22
|
+
#
|
23
|
+
# Second arg is either a string that will be given to parse_string_spec,
|
24
|
+
# OR a hash that's the return value of parse_string_spec.
|
25
|
+
#
|
26
|
+
# Third arg is an optional options hash that will be passed as
|
27
|
+
# third arg of MarcExtractor constructor.
|
28
|
+
def self.extract_by_spec(marc_record, specification, options = {})
  # Raise a real, rescuable error class. The previous code referenced the
  # undefined constant IllegalArgument, so a nil record surfaced as a
  # confusing NameError instead of an argument error.
  raise ArgumentError, "first argument must not be nil" if marc_record.nil?

  # A String spec is parsed into the nested-hash form; a Hash is assumed to
  # already be the output of parse_string_spec and is used as-is.
  specification = parse_string_spec(specification) unless specification.kind_of?(Hash)

  # Returns whatever #extract returns for the constructed extractor.
  Traject::MarcExtractor.new(marc_record, specification, options).extract
end
|
37
|
+
|
38
|
+
# Take a hash that's the output of #parse_string_spec, return
|
39
|
+
# an array of strings extracted from a marc record accordingly
|
40
|
+
#
|
41
|
+
# options:
|
42
|
+
#
|
43
|
+
# [:seperator] default ' ' (space), what to use to seperate
|
44
|
+
# subfield values when joining strings
|
45
|
+
#
|
46
|
+
# [:alternate_script] default :include, include linked 880s for tags
|
47
|
+
# that match spec. Also:
|
48
|
+
# * false => do not include.
|
49
|
+
# * :only => only include linked 880s, not original
|
50
|
+
def initialize(marc_record, spec_hash, options = {})
  # Reject a non-Hash spec with a proper exception. The previous code called
  # IllegalArgumentException(...) as if it were a method, which does not
  # exist, so callers saw NoMethodError instead of the intended message.
  unless spec_hash.kind_of? Hash
    raise ArgumentError, "second arg to MarcExtractor.new must be a Hash specification object"
  end

  # Caller-supplied options win over the defaults.
  self.options = {
    :seperator        => ' ',
    :alternate_script => :include
  }.merge(options)

  self.marc_record = marc_record
  self.spec_hash   = spec_hash
end
|
61
|
+
|
62
|
+
# Converts from a string marc spec like "245abc:700a" to a nested hash used internally
|
63
|
+
# to represent the specification.
|
64
|
+
#
|
65
|
+
# a String specification is a string of form:
|
66
|
+
# {tag}{|indicators|}{subfields} seperated by colons
|
67
|
+
# tag is three chars (usually but not necessarily numeric),
|
68
|
+
# indicators are optional two chars enclosed in pipe characters (e.g. |01|),
|
69
|
+
# subfields are optional list of chars (alphanumeric)
|
70
|
+
#
|
71
|
+
# indicator spec must be two chars, but one can be * meaning "don't care".
|
72
|
+
# space to mean 'blank'
|
73
|
+
#
|
74
|
+
# "245|01|abc65:345abc:700|*5|:800"
|
75
|
+
#
|
76
|
+
# Or, for control (fixed) fields (ordinarily fields 001-010), you can include a byte slice specification,
|
77
|
+
# but can NOT include subfield or indicator specifications. Plus can use special tag "LDR" for
|
78
|
+
# the marc leader. (TODO)
|
79
|
+
#
|
80
|
+
# "008[35-37]:LDR[5]"
|
81
|
+
# => bytes 35-37 inclusive of field 008, and byte 5 of the marc leader.
|
82
|
+
#
|
83
|
+
# Returns a nested hash keyed by tags.
|
84
|
+
# { tag => {
|
85
|
+
# :subfields => ['a', 'b', '2'] # actually, a SET. may be empty or nil
|
86
|
+
# :indicators => ['1', '0'] # An array. may be empty or nil; duple, either one can be nil
|
87
|
+
# }
|
88
|
+
#}
|
89
|
+
# For byte offsets, :bytes => 12 or :bytes => (7..10)
|
90
|
+
#
|
91
|
+
# * subfields and indicators can only be provided for marc data/variable fields
|
92
|
+
# * byte slice can only be provided for marc control fields (generally tags less than 010)
|
93
|
+
#
|
94
|
+
# See tests for more examples.
|
95
|
+
def self.parse_string_spec(spec_string)
  # Data/variable field: tag, optional |xx| indicators, optional subfield codes.
  variable_field = /\A(?<tag>[a-zA-Z0-9]{3})(?:\|(?<indicators>[a-z0-9\ \*]{2})\|)?(?<subfields>[a-z0-9]*)?\Z/
  # Control field: tag followed by a byte offset or inclusive byte range, e.g. "005[4-5]".
  control_field = /\A(?<tag>[a-zA-Z0-9]{3})\[(?<first>\d+)(?:-(?<last>\d+))?\]\Z/

  # Builds and returns { tag => { :subfields => [...], :indicators => [a, b], :bytes => n or range } }.
  # Parts naming the same tag are merged into one entry.
  spec_string.split(":").each_with_object({}) do |part, hash|
    if (found = variable_field.match(part))
      entry = (hash[found[:tag]] ||= {})

      # :subfields is only set when at least one code was given, so a bare
      # tag leaves it nil.
      codes = found[:subfields].to_s
      (entry[:subfields] ||= []).concat(codes.chars) unless codes.empty?

      # "*" means "don't care" and is stored as nil.
      if (indicators = found[:indicators])
        entry[:indicators] = [indicators[0], indicators[1]].map { |char| char unless char == "*" }
      end
    elsif (found = control_field.match(part))
      entry = (hash[found[:tag]] ||= {})
      first_byte, last_byte = found[:first], found[:last]
      entry[:bytes] = last_byte ? (first_byte.to_i..last_byte.to_i) : first_byte.to_i
    else
      raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
    end
  end
end
|
130
|
+
|
131
|
+
|
132
|
+
# Returns array of strings, extracted values
|
133
|
+
def extract
  collected = []

  each_matching_line do |field, spec|
    # Data fields contribute whatever collect_subfields produces for them.
    unless control_field?(field)
      collected.concat(collect_subfields(field, spec))
      next
    end

    # Control fields contribute their value, optionally narrowed to the
    # byte offset / byte range given in the spec.
    byte_spec = spec[:bytes]
    collected << (byte_spec ? field.value.byteslice(byte_spec) : field.value)
  end

  collected
end
|
146
|
+
|
147
|
+
# Yields a block for every line in source record that matches
|
148
|
+
# spec. First arg to block is MARC::Field (control or data), second
|
149
|
+
# is the hash specification that it matched on. May take account
|
150
|
+
# of options such as :alternate_script
|
151
|
+
def each_matching_line
  marc_record.each do |field|
    # Skip fields with no covering spec, and fields whose indicators do not
    # satisfy that spec.
    spec = spec_covering_field(field)
    next unless spec && matches_indicators(field, spec)

    yield(field, spec)
  end
end
|
158
|
+
|
159
|
+
# Pass in a marc data field and a hash spec, returns
|
160
|
+
# an ARRAY of one or more strings, subfields extracted
|
161
|
+
# and processed per spec. Takes account of options such
|
162
|
+
# as :seperator
|
163
|
+
def collect_subfields(field, spec)
  wanted_codes = spec[:subfields]

  # A nil :subfields list means "every subfield"; nil values are dropped.
  values = field.subfields.
    select { |subfield| wanted_codes.nil? || wanted_codes.include?(subfield.code) }.
    map    { |subfield| subfield.value }.
    compact

  # With a separator the values collapse into a single joined string;
  # without one each value is returned on its own.
  joiner = options[:seperator]
  joiner ? [values.join(joiner)] : values
end
|
170
|
+
|
171
|
+
# Is there a spec covering extraction from this field?
|
172
|
+
# May return true on 880's matching other tags depending
|
173
|
+
# on value of :alternate_script
|
174
|
+
# if :alternate_script is :only, will return original spec when field is an 880.
|
175
|
+
# otherwise will always return nil for 880s, you have to handle :alternate_script :include
|
176
|
+
# elsewhere, to add in the 880 in the right order
|
177
|
+
def spec_covering_field(field)
  script_mode = options[:alternate_script]

  if field.tag == "880" && script_mode != false
    linkage = field["6"]

    # An 880 with no $6 linkage cannot be tied back to an original tag, so
    # no spec covers it. Previously #encode was called on the nil $6 before
    # the presence check, raising NoMethodError on such records.
    return nil if linkage.nil?

    # The first three bytes of $6 are the original tag this 880 corresponds
    # to. The re-encode to the same encoding works around
    # https://github.com/jruby/jruby/issues/886 .
    orig_tag = linkage.encode(linkage.encoding).byteslice(0, 3)
    spec_hash[orig_tag]
  elsif script_mode != :only
    # Ordinary lookup by the field's own tag (nil when no spec names it).
    spec_hash[field.tag]
  end
end
|
191
|
+
|
192
|
+
def control_field?(field)
  # True for MARC::ControlField instances (including subclasses), false for
  # anything else.
  MARC::ControlField === field
end
|
197
|
+
|
198
|
+
# a marc field, and an individual spec hash, {:subfields => array, :indicators => array}
|
199
|
+
def matches_indicators(field, spec)
  wanted = spec[:indicators]

  # No indicator constraint at all: every field matches.
  return true if wanted.nil?

  # A nil slot is a wildcard; otherwise the slot must equal the field's
  # corresponding indicator.
  first, second = wanted[0], wanted[1]
  return false unless first.nil? || first == field.indicator1

  second.nil? || second == field.indicator2
end
|
205
|
+
end
|
206
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'marc'
|
2
|
+
|
3
|
+
# A Reader class that can be used with Traject::Indexer.reader, to read
|
4
|
+
# MARC records.
|
5
|
+
#
|
6
|
+
# Includes Enumerable for convenience.
|
7
|
+
#
|
8
|
+
# Reads in Marc records using ruby marc. Depends on config variables to
|
9
|
+
# determine what serialization type to expect, and other parameters controlling
|
10
|
+
# de-serialization.
|
11
|
+
#
|
12
|
+
# Settings:
|
13
|
+
# ["marc_source.type"] serialization type. default 'binary'
|
14
|
+
# * "binary". Actual marc.
|
15
|
+
# * "xml", MarcXML
|
16
|
+
# * "json". (NOT YET IMPLEMENTED) The "marc-in-json" format, encoded as newline-seperated
|
17
|
+
# json. A simplistic newline-seperated json, with no comments
|
18
|
+
# allowed, and no unescaped internal newlines allowed in the json
|
19
|
+
# objects -- we just read line by line, and assume each line is a
|
20
|
+
# marc-in-json. http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/
|
21
|
+
# ["marc_source.xml_parser"] For XML type, which XML parser to tell Marc::Reader
|
22
|
+
# to use. Anything recognized by Marc::Reader :parser
|
23
|
+
# argument. By default, asks Marc::Reader to take
|
24
|
+
# its best guess as to highest performance available
|
25
|
+
# installed option.
|
26
|
+
#
|
27
|
+
#
|
28
|
+
# Can NOT yet read Marc8, input is always assumed UTF8.
|
29
|
+
class Traject::MarcReader
  include Enumerable

  attr_reader :settings, :input_stream

  # Parser choice reported by ruby-marc, resolved once when the class is
  # loaded and used whenever settings do not name an XML parser.
  @@best_xml_parser = MARC::XMLReader.best_available

  # input_stream - the source handed to the underlying ruby-marc reader.
  # settings     - hash-like object read with string keys such as
  #                "marc_source.type" and "marc_source.xml_parser".
  def initialize(input_stream, settings)
    @input_stream = input_stream
    @settings     = settings
  end

  # Lazily builds, memoizes and returns the ruby-marc reader: an XMLReader
  # when "marc_source.type" is "xml", a plain MARC::Reader for every other
  # value (including an unset one).
  def internal_reader
    return @internal_reader if defined? @internal_reader

    @internal_reader =
      if "xml" == settings["marc_source.type"]
        xml_parser = settings["marc_source.xml_parser"] || @@best_xml_parser
        MARC::XMLReader.new(input_stream, :parser => xml_parser)
      else
        MARC::Reader.new(input_stream)
      end
  end

  # Delegates iteration (arguments and block included) to the underlying
  # reader; this is what makes the Enumerable methods available.
  def each(*args, &block)
    internal_reader.each(*args, &block)
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# From http://redcorundum.blogspot.com/2006/05/kernelqualifiedconstget.html
|
2
|
+
# Adapted into a module, rather than monkey patching it into Kernel
|
3
|
+
#
|
4
|
+
# Method to take a string constant name, including :: qualifications, and
|
5
|
+
# look up the actual constant. Looks up relative to current file.
|
6
|
+
# Respects leading ::. Etc.
|
7
|
+
module Traject::QualifiedConstGet
|
8
|
+
|
9
|
+
|
10
|
+
def qualified_const_get(str)
  segments = str.to_s.split('::')

  # Walks a list of constant names downward, starting at Object.
  resolve = lambda do |names|
    names.inject(Object) { |scope, name| scope.const_get(name) }
  end

  # A leading "::" produces an empty first segment: resolve absolutely.
  return resolve.call(segments[1..-1]) if segments[0].empty?

  # Otherwise start from the namespace of the receiver (the receiver itself
  # when it is a class or module, else its class) and retry one namespace
  # level further out each time the lookup raises NameError.
  base   = ((Class === self) || (Module === self)) ? self : self.class
  prefix = base.to_s.split('::')

  until prefix.empty?
    begin
      return resolve.call(prefix + segments)
    rescue NameError
      prefix.pop
    end
  end

  # Last resort: top-level lookup; a NameError here reaches the caller.
  resolve.call(segments)
end
|
29
|
+
|
30
|
+
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
require 'traject'
|
2
|
+
require 'traject/qualified_const_get'
|
3
|
+
|
4
|
+
#
|
5
|
+
# Writes to a Solr using SolrJ, and the SolrJ HttpSolrServer.
|
6
|
+
# (sub-class later for the ConcurrentUpdate server?)
|
7
|
+
#
|
8
|
+
# settings:
|
9
|
+
# [solr.url] Your solr url (required)
|
10
|
+
# [solrj_writer.server_class_name] Defaults to "HttpSolrServer". You can specify
|
11
|
+
# another Solr Server sub-class, but it has
|
12
|
+
# to take a one-arg url constructor. Maybe
|
13
|
+
# subclass this writer class and overwrite
|
14
|
+
# instantiate_solr_server! otherwise
|
15
|
+
# [solrj.jar_dir] Custom directory containing all of the SolrJ jars. All
|
16
|
+
# jars in this dir will be loaded. Otherwise,
|
17
|
+
# we load our own packaged solrj jars. This setting
|
18
|
+
# can't really be used differently in the same app instance,
|
19
|
+
# since jars are loaded globally.
|
20
|
+
# [solrj_writer.parser_class_name] A String name of a class in package
|
21
|
+
# org.apache.solr.client.solrj.impl,
|
22
|
+
# we'll instantiate one with a zero-arg
|
23
|
+
# constructor, and pass it as an arg to setParser on
|
24
|
+
# the SolrServer instance, if present.
|
25
|
+
# NOTE: For contacting a Solr 1.x server, with the
|
26
|
+
# recent version of SolrJ used by default, set to
|
27
|
+
# "XMLResponseParser"
|
28
|
+
# [solrj_writer.commit_on_close] If true (or string 'true'), send a commit to solr
|
29
|
+
# at end of #process.
|
30
|
+
class Traject::SolrJWriter
|
31
|
+
include Traject::QualifiedConstGet
|
32
|
+
|
33
|
+
attr_reader :settings
|
34
|
+
|
35
|
+
# Stores the settings, then in order: validates them, makes sure the SolrJ
# classes are loaded, and eagerly builds the Solr server object.
def initialize(argSettings)
  @settings = argSettings

  settings_check!(@settings)
  ensure_solrj_loaded!
  solr_server
end
|
43
|
+
|
44
|
+
# Loads solrj if not already loaded. By loading all jars found
|
45
|
+
# in settings["solrj.jar_dir"]
|
46
|
+
def ensure_solrj_loaded!
  # Nothing to do when both classes are already visible.
  return if defined?(HttpSolrServer) && defined?(SolrInputDocument)

  require 'java'

  attempts = 0
  begin
    attempts += 1
    java_import org.apache.solr.client.solrj.impl.HttpSolrServer
    java_import org.apache.solr.common.SolrInputDocument
  rescue NameError
    # Import failed: require every jar from the configured directory, or
    # from the vendor/solrj/lib directory resolved relative to this file.
    bundled_jar_dir = File.expand_path("../../vendor/solrj/lib", File.dirname(__FILE__))
    jar_dir = settings["solrj.jar_dir"] || bundled_jar_dir

    Dir.glob("#{jar_dir}/*.jar") { |jar| require jar }

    # Only one retry: a second failure means the jars did not help.
    raise LoadError.new("Can not find SolrJ java classes") if attempts > 1

    retry
  end
end
|
72
|
+
|
73
|
+
# Builds one SolrInputDocument from a { field_name => [values] } hash and
# adds it to the Solr server. Every value of every field becomes its own
# addField call.
def put(hash)
  document = SolrInputDocument.new

  hash.each_pair do |field_name, values|
    values.each { |single_value| document.addField(field_name, single_value) }
  end

  # TODO: Buffer docs internally, add in arrays, one http
  # transaction per array. Is what solrj wiki recommends.
  solr_server.add(document)
end
|
86
|
+
|
87
|
+
# Optionally commits (when "solrj_writer.commit_on_close" stringifies to
# "true"), shuts the server object down, and forgets it.
def close
  commit_requested = settings["solrj_writer.commit_on_close"].to_s == "true"
  solr_server.commit if commit_requested

  solr_server.shutdown
  @solr_server = nil
end
|
93
|
+
|
94
|
+
|
95
|
+
# Memoized server object: built via instantiate_solr_server! on first use
# (or again after it has been reset to nil).
def solr_server
  return @solr_server if @solr_server

  @solr_server = instantiate_solr_server!
end
|
98
|
+
attr_writer :solr_server # mainly for testing
|
99
|
+
|
100
|
+
# Instantiates a solr server of class settings["solrj_writer.server_class_name"] or "HttpSolrServer"
|
101
|
+
# and initializes it with settings["solr.url"]
|
102
|
+
def instantiate_solr_server!
  # Resolve the configured server class (default "HttpSolrServer") and
  # construct it with the stringified solr url as its only argument.
  class_name = settings["solrj_writer.server_class_name"] || "HttpSolrServer"
  server = qualified_const_get(class_name).new(settings["solr.url"].to_s)

  # When a parser class name is configured, look it up in the
  # org.apache.solr.client.solrj.impl package, build it with no arguments
  # and hand it to the server.
  parser_name = settings["solrj_writer.parser_class_name"]
  if parser_name
    parser = org.apache.solr.client.solrj.impl.const_get(parser_name).new
    server.setParser(parser)
  end

  server
end
|
113
|
+
|
114
|
+
# Raises ArgumentError unless settings has a non-nil "solr.url" entry;
# returns nil otherwise.
def settings_check!(settings)
  return if settings.has_key?("solr.url") && !settings["solr.url"].nil?

  raise ArgumentError.new("SolrJWriter requires a 'solr.url' solr url in settings")
end
|
119
|
+
|
120
|
+
end
|
@@ -0,0 +1,184 @@
|
|
1
|
+
require 'traject'
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
|
6
|
+
module Traject
|
7
|
+
# A TranslationMap is basically just something that has a hash-like #[]
|
8
|
+
# method to map from input strings to output strings:
|
9
|
+
#
|
10
|
+
# translation_map["some_input"] #=> some_output
|
11
|
+
#
|
12
|
+
# Input is assumed to always be string, output is either string
|
13
|
+
# or array of strings.
|
14
|
+
#
|
15
|
+
# What makes it more useful than a stunted hash is its ability to load
|
16
|
+
# the hash definitions from configuration files, either pure ruby or
|
17
|
+
# yaml.
|
18
|
+
#
|
19
|
+
# TranslationMap.new("dir/some_file")
|
20
|
+
#
|
21
|
+
# Will look through the entire ruby $LOAD_PATH, for a translation_maps subdir
|
22
|
+
# that contains either some_file.rb OR some_file.yaml
|
23
|
+
# * Looks for "/translation_maps" subdir in load paths, so
|
24
|
+
# for instance you can have a gem that keeps translation maps
|
25
|
+
# in ./lib/translation_maps, and it Just Works.
|
26
|
+
# * Note you do NOT supply the ".rb" or ".yaml" suffix yourself,
|
27
|
+
# it'll use whichever it finds (allows calling code to not care which is used).
|
28
|
+
#
|
29
|
+
# Ruby files just need to have their last line eval to a hash. The file
|
30
|
+
# will be run through `eval`, don't do it with untrusted content (naturally)
|
31
|
+
#
|
32
|
+
# You can also pass in a Hash for consistency to TranslationMap.new, although
|
33
|
+
# I don't know why you'd want to.
|
34
|
+
#
|
35
|
+
# == Special default handling
|
36
|
+
#
|
37
|
+
# The key "__default__" in the hash is treated specially. If set to a string,
|
38
|
+
# that string will be returned by the TranslationMap for any input not otherwise
|
39
|
+
# included. If set to the special string "__passthrough__", then for input not
|
40
|
+
# mapped, the original input string will be returned.
|
41
|
+
#
|
42
|
+
# This is most useful for YAML definition files, if you are using an actual ruby
|
43
|
+
# hash, you could just set the hash to do what you want using Hash#default_proc
|
44
|
+
# etc.
|
45
|
+
#
|
46
|
+
# Or, when calling TranslationMap.new(), you can pass in options over-riding special
|
47
|
+
# key too:
|
48
|
+
#
|
49
|
+
# TranslationMap.new("something", :default => "foo")
|
50
|
+
# TranslationMap.new("something", :default => :passthrough)
|
51
|
+
#
|
52
|
+
# == Output: String or array of strings
|
53
|
+
#
|
54
|
+
# The output can be a string or an array of strings, or nil. It should not be anything else.
|
55
|
+
# When used with the #translate_array! method, one string can be replaced by multiple values
|
56
|
+
# (array of strings) or removed (nil)
|
57
|
+
#
|
58
|
+
# == Caching
|
59
|
+
# Lookup and loading of configuration files will be cached, for efficiency.
|
60
|
+
# You can reset with `TranslationMap.reset_cache!`
|
61
|
+
#
|
62
|
+
# == YAML example:
|
63
|
+
#
|
64
|
+
# key: value
|
65
|
+
# key2: value2 multiple words fine
|
66
|
+
# key2b: "Although you can use quotes if you want: Or need."
|
67
|
+
# key3:
|
68
|
+
# - array
|
69
|
+
# - of
|
70
|
+
# - values look like this
|
71
|
+
class TranslationMap
|
72
|
+
class Cache
  def initialize
    @cached = {}
  end

  # Returns the definition Hash for path, or nil if none was found.
  # The result (including a nil "not found") is remembered per path, so the
  # load path is only searched once until reset_cache! is called.
  def lookup(path)
    @cached[path] = _lookup!(path) unless @cached.has_key?(path)
    @cached[path]
  end

  # Forced lookup that bypasses the cache; used by #lookup.
  #
  # Searches each $LOAD_PATH entry, in order, for
  # translation_maps/<path>.rb and then translation_maps/<path>.yaml, and
  # returns the definition from the FIRST entry that has either file.
  # Returns nil if no entry has one. May raise on a syntax error in the
  # file being loaded.
  #
  # Changes from the earlier implementation:
  # * a YAML hit now stops the search, as an .rb hit already did; before,
  #   the scan continued and a later load-path entry could silently
  #   replace an earlier match.
  # * File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  # * File.read replaces File.open(...).read, which left the handle open
  #   until garbage collection.
  def _lookup!(path)
    $LOAD_PATH.each do |base|
      rb_file   = File.join(base, "translation_maps", "#{path}.rb")
      yaml_file = File.join(base, "translation_maps", "#{path}.yaml")

      if File.exist?(rb_file)
        # NOTE: the file is executed as Ruby code; only use trusted
        # translation map files.
        return eval(File.read(rb_file), binding, rb_file)
      elsif File.exist?(yaml_file)
        return YAML.load_file(yaml_file)
      end
    end

    nil
  end

  # Forgets every remembered lookup.
  def reset_cache!
    @cached.clear
  end
end
|
112
|
+
|
113
|
+
attr_reader :hash
|
114
|
+
attr_reader :default
|
115
|
+
|
116
|
+
class << self
  # Reader for the class-level cache object.
  def cache
    @cache
  end

  # Writer for the class-level cache object.
  def cache=(new_cache)
    @cache = new_cache
  end

  # Asks the current cache object to drop everything it has remembered.
  def reset_cache!
    cache.reset_cache!
  end
end
|
122
|
+
self.cache = Cache.new
|
123
|
+
|
124
|
+
|
125
|
+
# defn    - a Hash used directly as the map, or a name resolved through the
#           class-level cache.
# options - :default overrides any "__default__" entry of the definition.
#
# Raises NotFound when a named definition cannot be located.
def initialize(defn, options = {})
  if defn.kind_of? Hash
    @hash = defn
  else
    @hash = self.class.cache.lookup(defn)
    raise NotFound.new(defn) if @hash.nil?
  end

  if options[:default]
    @default = options[:default]
  elsif @hash.has_key? "__default__"
    # Strip the special key from a private copy. The hash may be the shared
    # object held by the cache (or one the caller still owns); deleting from
    # it directly made every later TranslationMap built from the same
    # definition lose its default.
    @hash = @hash.dup
    @default = @hash.delete("__default__")
  end
end
|
139
|
+
|
140
|
+
# Looks key up in the map.
#
# When a default is configured and key is not present in the hash:
# * a default of "__passthrough__" (the definition-file spelling) or
#   :passthrough (the spelling shown for TranslationMap.new's :default
#   option) returns key itself;
# * any other default is returned as the value.
# Otherwise returns the hash's own value for key.
#
# Previously only the "__passthrough__" string was recognised, so
# :default => :passthrough handed back the symbol :passthrough as the
# translated value.
def [](key)
  fallback = default

  if fallback && !@hash.has_key?(key)
    return key if fallback == "__passthrough__" || fallback == :passthrough

    return fallback
  end

  @hash[key]
end
|
151
|
+
alias_method :map, :[]
|
152
|
+
|
153
|
+
# Run every element of an array through this translation map,
|
154
|
+
# return the resulting array. If translation map returns nil,
|
155
|
+
# original element will be missing from output.
|
156
|
+
#
|
157
|
+
# If an input maps to an array, each element of the array will be flattened
|
158
|
+
# into the output.
|
159
|
+
#
|
160
|
+
# If an input maps to nil, it will cause the input element to be removed
|
161
|
+
# entirely.
|
162
|
+
def translate_array(array)
  array.flat_map do |element|
    translated = map(element)

    if translated.kind_of?(Array)
      translated       # each member lands in the output individually
    elsif translated.nil?
      []               # unmapped / nil: the element disappears
    else
      [translated]     # single value: kept as one output element
    end
  end
end
|
172
|
+
|
173
|
+
# In-place variant of translate_array: the contents of array are replaced
# with the translated values, and the same array object is returned.
def translate_array!(array)
  translated = translate_array(array)
  array.replace(translated)
end
|
176
|
+
|
177
|
+
# Raised when no translation map definition file can be located for a
# given name.
#
# Inherits from StandardError rather than Exception so that a plain
# `rescue` / `rescue => e` can handle it; rescuing NotFound or Exception
# explicitly keeps working as before.
class NotFound < StandardError
  def initialize(path)
    super("No translation map definition file found at '#{path}[.rb|.yaml]' in load path")
  end
end
|
182
|
+
|
183
|
+
end
|
184
|
+
end
|