traject 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +346 -0
- data/Rakefile +16 -0
- data/bin/traject +153 -0
- data/doc/macros.md +103 -0
- data/doc/settings.md +34 -0
- data/lib/traject.rb +10 -0
- data/lib/traject/indexer.rb +196 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +145 -0
- data/lib/traject/marc_extractor.rb +206 -0
- data/lib/traject/marc_reader.rb +61 -0
- data/lib/traject/qualified_const_get.rb +30 -0
- data/lib/traject/solrj_writer.rb +120 -0
- data/lib/traject/translation_map.rb +184 -0
- data/lib/traject/version.rb +3 -0
- data/test/indexer/macros_marc21_test.rb +146 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +120 -0
- data/test/indexer/read_write_test.rb +47 -0
- data/test/indexer/settings_test.rb +65 -0
- data/test/marc_extractor_test.rb +168 -0
- data/test/marc_reader_test.rb +29 -0
- data/test/solrj_writer_test.rb +106 -0
- data/test/test_helper.rb +28 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/translation_map_test.rb +98 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +30 -0
- data/vendor/solrj/README +8 -0
- data/vendor/solrj/build.xml +39 -0
- data/vendor/solrj/ivy.xml +16 -0
- data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
- data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
- data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
- data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
- data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
- data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
- data/vendor/solrj/lib/noggit-0.5.jar +0 -0
- data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
- data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
- data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
- data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
- metadata +264 -0
@@ -0,0 +1,206 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Traject
|
4
|
+
# MarcExtractor is a class for extracting lists of strings from a MARC::Record,
|
5
|
+
# according to specifications. See #parse_string_spec for description of string
|
6
|
+
# arguments used to specify extraction. See #initialize for options
|
7
|
+
# that can be set controlling extraction.
|
8
|
+
#
|
9
|
+
# Examples:
|
10
|
+
#
|
11
|
+
# array_of_stuff = MarcExtractor.extract_by_spec(marc_record, "001:245abc:700a")
|
12
|
+
# values = MarcExtractor.extract_by_spec(marc_record, "040a", :seperator => nil)
|
13
|
+
#
|
14
|
+
class MarcExtractor
  attr_accessor :options, :marc_record, :spec_hash

  # Convenience method to construct a MarcExtractor object and
  # run #extract on it.
  #
  # marc_record::   the record to extract from; must not be nil.
  # specification:: either a String that will be given to parse_string_spec,
  #                 OR a Hash that's the return value of parse_string_spec.
  # options::       optional options hash, passed through as the third
  #                 argument of the MarcExtractor constructor.
  #
  # Returns the array of extracted strings.
  # Raises ArgumentError if marc_record is nil, or if the string
  # specification can not be parsed.
  def self.extract_by_spec(marc_record, specification, options = {})
    raise ArgumentError, "first argument must not be nil" if marc_record.nil?

    specification = parse_string_spec(specification) unless specification.kind_of? Hash

    new(marc_record, specification, options).extract
  end

  # Take a hash that's the output of #parse_string_spec, and prepare to
  # extract an array of strings from a marc record accordingly.
  #
  # options:
  #
  # [:seperator]  default ' ' (space), what to use to separate
  #               subfield values when joining strings. If nil (or false),
  #               subfield values are not joined, each one is returned as
  #               its own string.
  #
  # [:alternate_script] default :include, include linked 880s for tags
  #                     that match spec. Also:
  #                     * false => do not include.
  #                     * :only => only include linked 880s, not original
  #
  # Raises ArgumentError if spec_hash is not a Hash.
  def initialize(marc_record, spec_hash, options = {})
    unless spec_hash.kind_of? Hash
      raise ArgumentError, "second arg to MarcExtractor.new must be a Hash specification object"
    end

    self.options = {
      :seperator        => ' ',
      :alternate_script => :include
    }.merge(options)

    self.marc_record = marc_record
    self.spec_hash   = spec_hash
  end

  # Converts from a string marc spec like "245abc:700a" to a nested hash used internally
  # to represent the specification.
  #
  # a String specification is a string of form:
  #  {tag}{|indicators|}{subfields} separated by colons
  # tag is three chars (usually but not necessarily numeric),
  # indicators are optional two chars enclosed in pipes (|),
  # subfields are optional list of chars (lowercase alphanumeric)
  #
  # indicator spec must be two chars, but one can be * meaning "don't care".
  # space to mean 'blank'
  #
  # "245|01|abc65:345abc:700|*5|:800"
  #
  # Or, for control (fixed) fields (ordinarily fields 001-010), you can include a byte slice specification,
  # but can NOT include subfield or indicator specifications. Plus can use special tag "LDR" for
  # the marc leader. (TODO)
  #
  #  "008[35-37]:LDR[5]"
  #  => bytes 35-37 inclusive of field 008, and byte 5 of the marc leader.
  #
  # Returns a nested hash keyed by tags.
  # { tag => {
  #     :subfields => ['a', 'b', '2'] # an Array of one-char strings; key is absent when no subfields were given
  #     :indicators => ['1', '0'] # An array; key is absent when not given; duple, either one can be nil
  #    }
  # }
  # For byte offsets, :bytes => 12 or :bytes => (7..10)
  #
  # * subfields and indicators can only be provided for marc data/variable fields
  # * byte slice can only be provided for marc control fields (generally tags less than 010)
  #
  # Raises ArgumentError for any colon-separated part that matches neither form.
  #
  # See tests for more examples.
  def self.parse_string_spec(spec_string)
    hash = {}

    spec_string.split(":").each do |part|
      if part =~ /\A([a-zA-Z0-9]{3})(\|([a-z0-9\ \*]{2})\|)?([a-z0-9]*)?\Z/
        # variable field
        tag, indicators, subfields = $1, $3, $4

        hash[tag] ||= {}

        # An empty subfield list leaves :subfields unset, which #collect_subfields
        # treats as "all subfields".
        if subfields && !subfields.empty?
          (hash[tag][:subfields] ||= []).concat(subfields.chars.to_a)
        end

        if indicators
          # "*" is the wildcard, stored as nil so #matches_indicators skips it
          hash[tag][:indicators] = [
            (indicators[0] if indicators[0] != "*"),
            (indicators[1] if indicators[1] != "*")
          ]
        end
      elsif part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/ # "005[4-5]"
        tag, byte1, byte2 = $1, $3, $5
        hash[tag] ||= {}

        if byte1 && byte2
          hash[tag][:bytes] = ((byte1.to_i)..(byte2.to_i))
        elsif byte1
          hash[tag][:bytes] = byte1.to_i
        end
      else
        raise ArgumentError, "Unrecognized marc extract specification: #{part}"
      end
    end

    hash
  end

  # Returns array of strings, extracted values. Control fields contribute
  # their value (or the byte slice named by :bytes); data fields contribute
  # whatever #collect_subfields returns for them.
  def extract
    results = []

    each_matching_line do |field, spec|
      if control_field?(field)
        results << (spec[:bytes] ? field.value.byteslice(spec[:bytes]) : field.value)
      else
        results.concat collect_subfields(field, spec)
      end
    end

    results
  end

  # Yields a block for every line in source record that matches
  # spec. First arg to block is MARC::Field (control or data), second
  # is the hash specification that it matched on. May take account
  # of options such as :alternate_script
  def each_matching_line
    marc_record.each do |field|
      spec = spec_covering_field(field)
      yield(field, spec) if spec && matches_indicators(field, spec)
    end
  end

  # Pass in a marc data field and a hash spec, returns
  # an ARRAY of one or more strings, subfields extracted
  # and processed per spec. Takes account of options such
  # as :seperator: with a separator the selected values are joined into
  # a single-element array, without one they are returned individually.
  def collect_subfields(field, spec)
    wanted = spec[:subfields]

    subfields = field.subfields.collect do |subfield|
      subfield.value if wanted.nil? || wanted.include?(subfield.code)
    end.compact

    options[:seperator] ? [subfields.join(options[:seperator])] : subfields
  end

  # Is there a spec covering extraction from this field?
  #
  # For an 880 field, unless :alternate_script is false, the spec is looked
  # up under the tag named by the first three bytes of the field's $6
  # linkage subfield. An 880 with no $6 can not be linked to any tag
  # and yields nil (previously this raised NoMethodError on nil).
  #
  # For any other field (or an 880 when :alternate_script is false), the
  # spec is looked up under the field's own tag, unless :alternate_script
  # is :only, in which case nil is returned.
  #
  # Returns the individual spec hash, or nil if the field is not covered.
  def spec_covering_field(field)
    if field.tag == "880" && options[:alternate_script] != false
      linkage = field["6"]
      return nil if linkage.nil?

      # pull out the spec for corresponding original marc tag this 880 corresponds to
      # Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
      # to do this weird encode gymnastics, which fixes it for mysterious reasons.
      orig_tag = linkage.encode(linkage.encoding).byteslice(0, 3)
      spec_hash[orig_tag]
    elsif options[:alternate_script] != :only
      spec_hash[field.tag]
    end
  end

  # True if field is a MARC::ControlField.
  def control_field?(field)
    # should the MARC gem have a more efficient way to do this,
    # define #control_field? on both ControlField and DataField?
    field.kind_of? MARC::ControlField
  end

  # a marc field, and an individual spec hash, {:subfields => array, :indicators => array}
  #
  # Returns true when the spec has no :indicators, or when each non-nil
  # indicator in the spec equals the corresponding indicator of the field.
  def matches_indicators(field, spec)
    wanted = spec[:indicators]
    return true if wanted.nil?

    (wanted[0].nil? || wanted[0] == field.indicator1) &&
      (wanted[1].nil? || wanted[1] == field.indicator2)
  end
end
|
206
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'marc'
|
2
|
+
|
3
|
+
# A Reader class that can be used with Traject::Indexer.reader, to read
|
4
|
+
# MARC records.
|
5
|
+
#
|
6
|
+
# Includes Enumerable for convenience.
|
7
|
+
#
|
8
|
+
# Reads in Marc records using ruby marc. Depends on config variables to
|
9
|
+
# determine what serialization type to expect, and other parameters controlling
|
10
|
+
# de-serialization.
|
11
|
+
#
|
12
|
+
# Settings:
|
13
|
+
# ["marc_source.type"] serialization type. default 'binary'
|
14
|
+
# * "binary". Actual marc.
|
15
|
+
# * "xml", MarcXML
|
16
|
+
# * "json". (NOT YET IMPLEMENTED) The "marc-in-json" format, encoded as newline-seperated
|
17
|
+
# json. A simplistic newline-seperated json, with no comments
|
18
|
+
# allowed, and no unescaped internal newlines allowed in the json
|
19
|
+
# objects -- we just read line by line, and assume each line is a
|
20
|
+
# marc-in-json. http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/
|
21
|
+
# ["marc_source.xml_parser"] For XML type, which XML parser to tell Marc::Reader
|
22
|
+
# to use. Anything recognized by Marc::Reader :parser
|
23
|
+
# argument. By default, asks Marc::Reader to take
|
24
|
+
# its best guess as to highest performance available
|
25
|
+
# installed option.
|
26
|
+
#
|
27
|
+
#
|
28
|
+
# Can NOT yet read Marc8, input is always assumed UTF8.
|
29
|
+
class Traject::MarcReader
  include Enumerable

  attr_reader :settings, :input_stream

  # Parser choice used for "xml" input when settings name none; asked of
  # MARC::XMLReader once, when this class body is evaluated.
  @@best_xml_parser = MARC::XMLReader.best_available

  # input_stream:: handed as-is to the underlying ruby-marc reader.
  # settings::     hash-like object, read with string keys such as
  #                "marc_source.type" and "marc_source.xml_parser".
  def initialize(input_stream, settings)
    @input_stream = input_stream
    @settings     = settings
  end

  # Lazily builds, memoizes and returns the ruby-marc reader matching
  # settings["marc_source.type"]: a MARC::XMLReader for "xml", a plain
  # MARC::Reader for anything else.
  def internal_reader
    return @internal_reader if defined? @internal_reader

    @internal_reader =
      if settings["marc_source.type"] == "xml"
        xml_parser = settings["marc_source.xml_parser"] || @@best_xml_parser
        MARC::XMLReader.new(input_stream, :parser => xml_parser)
      else
        MARC::Reader.new(input_stream)
      end
  end

  # Delegates iteration (arguments and block included) to the
  # underlying ruby-marc reader.
  def each(*args, &block)
    internal_reader.each(*args, &block)
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# From http://redcorundum.blogspot.com/2006/05/kernelqualifiedconstget.html
|
2
|
+
# Adapted into a module, rather than monkey patching it into Kernel
|
3
|
+
#
|
4
|
+
# Method to take a string constant name, including :: qualifications, and
|
5
|
+
# look up the actual constant. Looks up relative to current file.
|
6
|
+
# Respects leading ::. Etc.
|
7
|
+
module Traject::QualifiedConstGet
|
8
|
+
|
9
|
+
|
10
|
+
def qualified_const_get(str)
  # Resolves a (possibly "::"-qualified) constant name to the constant.
  # A leading "::" means "look up from the top level only". Otherwise the
  # name is tried inside the receiver's own namespace first, then inside
  # each enclosing namespace in turn, and finally at the top level.
  # Raises NameError if no candidate resolves.
  segments = str.to_s.split('::')

  if segments[0].empty?
    # leading "::" leaves an empty first segment: absolute lookup
    enclosing = []
    segments  = segments[1..-1]
  else
    origin    = ((Class === self) || (Module === self)) ? self : self.class
    enclosing = origin.to_s.split('::')
  end

  resolve = lambda do |names|
    names.inject(Object) { |scope, const_name| scope.const_get(const_name) }
  end

  # innermost namespace first, dropping one trailing level per failure
  enclosing.length.downto(1) do |depth|
    begin
      return resolve.call(enclosing.first(depth) + segments)
    rescue NameError
      next
    end
  end

  resolve.call(segments)
end
|
29
|
+
|
30
|
+
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
require 'traject'
|
2
|
+
require 'traject/qualified_const_get'
|
3
|
+
|
4
|
+
#
|
5
|
+
# Writes to a Solr using SolrJ, and the SolrJ HttpSolrServer.
|
6
|
+
# (sub-class later for the ConcurrentUpdate server?)
|
7
|
+
#
|
8
|
+
# settings:
|
9
|
+
# [solr.url] Your solr url (required)
|
10
|
+
# [solrj_writer.server_class_name] Defaults to "HttpSolrServer". You can specify
|
11
|
+
# another Solr Server sub-class, but it has
|
12
|
+
# to take a one-arg url constructor. Maybe
|
13
|
+
# subclass this writer class and overwrite
|
14
|
+
# instantiate_solr_server! otherwise
|
15
|
+
# [solrj.jar_dir] Custom directory containing all of the SolrJ jars. All
|
16
|
+
# jars in this dir will be loaded. Otherwise,
|
17
|
+
# we load our own packaged solrj jars. This setting
|
18
|
+
# can't really be used differently in the same app instance,
|
19
|
+
# since jars are loaded globally.
|
20
|
+
# [solrj_writer.parser_class_name] A String name of a class in package
|
21
|
+
# org.apache.solr.client.solrj.impl,
|
22
|
+
# we'll instantiate one with a zero-arg
|
23
|
+
# constructor, and pass it as an arg to setParser on
|
24
|
+
# the SolrServer instance, if present.
|
25
|
+
# NOTE: For contacting a Solr 1.x server, with the
|
26
|
+
# recent version of SolrJ used by default, set to
|
27
|
+
# "XMLResponseParser"
|
28
|
+
# [solrj_writer.commit_on_close] If true (or string 'true'), send a commit to solr
|
29
|
+
# at end of #process.
|
30
|
+
class Traject::SolrJWriter
  include Traject::QualifiedConstGet

  attr_reader :settings

  # argSettings:: hash-like settings object; must contain a non-nil
  #               "solr.url" (see #settings_check!).
  #
  # Validates settings, makes sure the SolrJ java classes are loaded, and
  # eagerly instantiates the solr server object.
  def initialize(argSettings)
    @settings = argSettings
    settings_check!(settings)

    ensure_solrj_loaded!

    solr_server # init
  end

  # Loads solrj if not already loaded. By loading all jars found
  # in settings["solrj.jar_dir"], or else in the vendor/solrj/lib
  # directory resolved relative to this file.
  #
  # Raises LoadError if the SolrJ classes still can not be imported
  # after the jars have been required.
  def ensure_solrj_loaded!
    return if defined?(HttpSolrServer) && defined?(SolrInputDocument)

    require 'java'

    tries = 0
    begin
      tries += 1
      java_import org.apache.solr.client.solrj.impl.HttpSolrServer
      java_import org.apache.solr.common.SolrInputDocument
    rescue NameError
      included_jar_dir = File.expand_path("../../vendor/solrj/lib", File.dirname(__FILE__))

      jardir = settings["solrj.jar_dir"] || included_jar_dir
      Dir.glob("#{jardir}/*.jar") do |x|
        require x
      end

      # only one retry: a second import failure means the jars didn't help
      raise LoadError, "Can not find SolrJ java classes" if tries > 1

      retry
    end
  end

  # Builds a SolrInputDocument from hash (field name => enumerable of
  # values, one addField call per value) and adds it to the solr server.
  def put(hash)
    doc = SolrInputDocument.new

    hash.each_pair do |key, value_array|
      value_array.each do |value|
        doc.addField(key, value)
      end
    end

    # TODO: Buffer docs internally, add in arrays, one http
    # transaction per array. Is what solrj wiki recommends.
    solr_server.add(doc)
  end

  # Sends a commit if settings["solrj_writer.commit_on_close"] is true
  # (or the string 'true'), then shuts the solr server object down and
  # forgets it. The shutdown happens even when the commit raises, so the
  # server object is not left open; the commit error still propagates.
  def close
    server = solr_server

    begin
      server.commit if settings["solrj_writer.commit_on_close"].to_s == "true"
    ensure
      server.shutdown
      @solr_server = nil
    end
  end

  # The memoized solr server object, instantiated on first use.
  def solr_server
    @solr_server ||= instantiate_solr_server!
  end
  attr_writer :solr_server # mainly for testing

  # Instantiates a solr server of class settings["solrj_writer.server_class_name"] or "HttpSolrServer"
  # and initializes it with settings["solr.url"]. If
  # settings["solrj_writer.parser_class_name"] is set, a parser of that
  # class (from org.apache.solr.client.solrj.impl) is created with a
  # zero-arg constructor and passed to setParser.
  def instantiate_solr_server!
    server_class = qualified_const_get(settings["solrj_writer.server_class_name"] || "HttpSolrServer")
    server       = server_class.new(settings["solr.url"].to_s)

    if parser_name = settings["solrj_writer.parser_class_name"]
      parser = org.apache.solr.client.solrj.impl.const_get(parser_name).new
      server.setParser(parser)
    end

    server
  end

  # Raises ArgumentError unless settings has a non-nil "solr.url".
  def settings_check!(settings)
    unless settings.has_key?("solr.url") && !settings["solr.url"].nil?
      raise ArgumentError, "SolrJWriter requires a 'solr.url' solr url in settings"
    end
  end
end
|
@@ -0,0 +1,184 @@
|
|
1
|
+
require 'traject'
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
|
6
|
+
module Traject
|
7
|
+
# A TranslationMap is basically just something that has a hash-like #[]
|
8
|
+
# method to map from input strings to output strings:
|
9
|
+
#
|
10
|
+
# translation_map["some_input"] #=> some_output
|
11
|
+
#
|
12
|
+
# Input is assumed to always be string, output is either string
|
13
|
+
# or array of strings.
|
14
|
+
#
|
15
|
+
# What makes it more useful than a stunted hash is its ability to load
|
16
|
+
# the hash definitions from configuration files, either pure ruby or
|
17
|
+
# yaml.
|
18
|
+
#
|
19
|
+
# TranslationMap.new("dir/some_file")
|
20
|
+
#
|
21
|
+
# Will look through the entire ruby $LOAD_PATH, for a translation_maps subdir
|
22
|
+
# that contains either some_file.rb OR some_file.yaml
|
23
|
+
# * Looks for "/translation_maps" subdir in load paths, so
|
24
|
+
# for instance you can have a gem that keeps translation maps
|
25
|
+
# in ./lib/translation_maps, and it Just Works.
|
26
|
+
# * Note you do NOT supply the ".rb" or ".yaml" suffix yourself,
|
27
|
+
# it'll use whichever it finds (allows calling code to not care which is used).
|
28
|
+
#
|
29
|
+
# Ruby files just need to have their last line eval to a hash. The file
|
30
|
+
# will be run through `eval`, don't do it with untrusted content (naturally)
|
31
|
+
#
|
32
|
+
# You can also pass in a Hash for consistency to TranslationMap.new, although
|
33
|
+
# I don't know why you'd want to.
|
34
|
+
#
|
35
|
+
# == Special default handling
|
36
|
+
#
|
37
|
+
# The key "__default__" in the hash is treated specially. If set to a string,
|
38
|
+
# that string will be returned by the TranslationMap for any input not otherwise
|
39
|
+
# included. If set to the special string "__passthrough__", then for input not
|
40
|
+
# mapped, the original input string will be returned.
|
41
|
+
#
|
42
|
+
# This is most useful for YAML definition files, if you are using an actual ruby
|
43
|
+
# hash, you could just set the hash to do what you want using Hash#default_proc
|
44
|
+
# etc.
|
45
|
+
#
|
46
|
+
# Or, when calling TranslationMap.new(), you can pass in options over-riding special
|
47
|
+
# key too:
|
48
|
+
#
|
49
|
+
# TranslationMap.new("something", :default => "foo")
|
50
|
+
# TranslationMap.new("something", :default => :passthrough)
|
51
|
+
#
|
52
|
+
# == Output: String or array of strings
|
53
|
+
#
|
54
|
+
# The output can be a string or an array of strings, or nil. It should not be anything else.
|
55
|
+
# When used with the #translate_array! method, one string can be replaced by multiple values
|
56
|
+
# (array of strings) or removed (nil)
|
57
|
+
#
|
58
|
+
# == Caching
|
59
|
+
# Lookup and loading of configuration files will be cached, for efficiency.
|
60
|
+
# You can reset with `TranslationMap.reset_cache!`
|
61
|
+
#
|
62
|
+
# == YAML example:
|
63
|
+
#
|
64
|
+
# key: value
|
65
|
+
# key2: value2 multiple words fine
|
66
|
+
# key2b: "Although you can use quotes if you want: Or need."
|
67
|
+
# key3:
|
68
|
+
# - array
|
69
|
+
# - of
|
70
|
+
# - values look like this
|
71
|
+
class TranslationMap
  # Finds translation map definition files on the ruby $LOAD_PATH, loads
  # them, and remembers the result per path (including "not found").
  class Cache
    def initialize
      @cached = {}
    end

    # Returns an actual Hash -- or nil if none found.
    # The same object is returned on every call for a given path, so
    # callers must not mutate it.
    def lookup(path)
      @cached[path] = _lookup!(path) unless @cached.has_key?(path)
      @cached[path]
    end

    # force lookup, without using cache.
    # used by cache. Returns the actual hash.
    # Returns nil if none found.
    # May raise on syntax error in file being loaded.
    #
    # $LOAD_PATH is searched in order and the first directory holding
    # translation_maps/{path}.rb or translation_maps/{path}.yaml wins;
    # within one directory the .rb file takes precedence.
    def _lookup!(path)
      $LOAD_PATH.each do |base|
        rb_file   = File.join(base, "translation_maps", "#{path}.rb")
        yaml_file = File.join(base, "translation_maps", "#{path}.yaml")

        if File.exist?(rb_file)
          # NOTE(review): eval runs the file as ruby code; only use
          # translation map files from trusted locations.
          return eval(File.read(rb_file), binding, rb_file)
        elsif File.exist?(yaml_file)
          return YAML.load_file(yaml_file)
        end
      end

      nil
    end

    # Forgets everything loaded so far.
    def reset_cache!
      @cached.clear
    end
  end

  # NOTE(review): this reader shadows Object#hash, so instances are
  # presumably not meant to be used as Hash keys -- kept as-is since it is
  # part of the public interface.
  attr_reader :hash
  attr_reader :default

  class << self
    attr_accessor :cache

    # Empties the shared definition-file cache.
    def reset_cache!
      cache.reset_cache!
    end
  end
  self.cache = Cache.new

  # defn::    either a Hash used directly as the map, or a path (without
  #           ".rb"/".yaml" suffix) looked up via the shared Cache.
  # options:: :default => value returned for unmapped input; the symbol
  #           :passthrough or the string "__passthrough__" means "return
  #           the input itself". Overrides a "__default__" key in the map.
  #
  # Raises NotFound if defn is a path and no definition file is found.
  def initialize(defn, options = {})
    if defn.kind_of? Hash
      @hash = defn
    else
      @hash = self.class.cache.lookup(defn)
      raise NotFound.new(defn) if @hash.nil?
    end

    if options[:default]
      @default = options[:default]
    elsif @hash.has_key? "__default__"
      # Work on a copy: the hash may be the cached object shared by every
      # map built from this path, or a hash owned by the caller. Deleting
      # from it in place made later maps from the same path lose their default.
      @hash    = @hash.dup
      @default = @hash.delete("__default__")
    end
  end

  # Returns the mapped value for key. For an unmapped key, returns the
  # key itself when the default is passthrough, else the default if one
  # is set, else whatever the underlying hash returns (normally nil).
  def [](key)
    if default && !@hash.has_key?(key)
      return passthrough_default? ? key : default
    end

    @hash[key]
  end
  alias_method :map, :[]

  # Run every element of an array through this translation map,
  # return the resulting array. If translation map returns nil,
  # original element will be missing from output.
  #
  # If an input maps to an array, each element of the array will be flattened
  # into the output.
  #
  # If an input maps to nil, it will cause the input element to be removed
  # entirely.
  def translate_array(array)
    array.each_with_object([]) do |input_element, output_array|
      output_element = map(input_element)

      if output_element.kind_of? Array
        output_array.concat output_element
      elsif !output_element.nil?
        output_array << output_element
      end
    end
  end

  # Like #translate_array, but replaces the contents of array in place.
  # Returns the (mutated) array.
  def translate_array!(array)
    array.replace(translate_array(array))
  end

  # Raised when no definition file can be found for a path.
  # NOTE(review): inherits from Exception rather than StandardError, so a
  # bare `rescue` will not catch it; left unchanged for compatibility.
  class NotFound < Exception
    def initialize(path)
      super("No translation map definition file found at '#{path}[.rb|.yaml]' in load path")
    end
  end

  private

  # True when the configured default means "hand back the input unchanged".
  def passthrough_default?
    default == "__passthrough__" || default == :passthrough
  end
end
|
184
|
+
end
|