mabmapper 1.0.0.pre18 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/mabmapper/cli.rb CHANGED
@@ -3,6 +3,7 @@
3
3
  #
4
4
  require 'mabmapper/elasticsearch_writer'
5
5
  require 'mabmapper/tar_writer'
6
+ require 'metacrunch/ubpb/transformations/mab_to_primo'
6
7
 
7
8
  module Mabmapper
8
9
  class Cli
@@ -12,7 +13,7 @@ module Mabmapper
12
13
  def initialize
13
14
  @options = {}
14
15
  parse_command_line!
15
- load_engine!
16
+ @transformation = Metacrunch::UBPB::Transformations::MabToPrimo.new
16
17
  process_files!
17
18
  end
18
19
 
@@ -35,11 +36,6 @@ module Mabmapper
35
36
  @options[:debug] = true
36
37
  end
37
38
 
38
- @options[:debug_fields] = []
39
- opts.on( '-f', '--debug-fields a,b,c', Array, "If debug mode is on only fields matching the given names will be debugged." ) do |fields|
40
- @options[:debug_fields] = fields
41
- end
42
-
43
39
  @options[:silent] = false
44
40
  opts.on( '-s', '--silent', "Do not output anything on the console" ) do
45
41
  @options[:silent] = true
@@ -74,22 +70,6 @@ module Mabmapper
74
70
  (puts optparse.help ; exit)
75
71
  end
76
72
 
77
- #
78
- # Load normalization engine
79
- #
80
- def load_engine!
81
- begin
82
- engine_file = "mabmapper/aleph_mab_xml_engine" # TODO: Make me configurable
83
- require engine_file
84
- engine_class_name = "#{engine_file}".classify
85
- @engine = engine_class_name.constantize.new
86
- log "#{engine_class_name} loaded!"
87
- rescue LoadError
88
- log "Error loading engine #{engine_file}."
89
- exit 1
90
- end
91
- end
92
-
93
73
  #
94
74
  # Process the input files
95
75
  #
@@ -119,6 +99,28 @@ module Mabmapper
119
99
 
120
100
  private
121
101
 
102
+ def hash_to_xml(hash)
103
+ builder = Nokogiri::XML::Builder.new do |xml|
104
+ xml.document do
105
+ hash.each_pair do |_key, _values|
106
+ if _values.present? || _values == false
107
+ if _values.is_a?(Array)
108
+ #xml.send("#{field.name.downcase.pluralize}_") do
109
+ _values.each do |_value|
110
+ xml.send("#{_key.downcase}_", _value)
111
+ end
112
+ #end
113
+ else
114
+ xml.send("#{_key.downcase}_", _values)
115
+ end
116
+ end
117
+ end
118
+ end
119
+ end
120
+
121
+ builder.to_xml
122
+ end
123
+
122
124
  def process_file(file)
123
125
  case
124
126
  when file.end_with?('.tar') then process_tar_file(file)
@@ -137,13 +139,11 @@ module Mabmapper
137
139
  tarReader.each do |entry|
138
140
  if entry.file?
139
141
  log "Processing file #{entry.full_name} from archive #{file}"
140
- result = @engine.process(entry.full_name, entry.read, archive: file)
142
+ result = @transformation.call(entry.read.force_encoding("utf-8"))
141
143
 
142
144
  writer.add_file(entry.full_name, 0644) do |f|
143
- f.write(result.to_xml)
145
+ f.write(hash_to_xml(result))
144
146
  end if writer
145
-
146
- log "Result for #{entry.full_name} from archive #{file}\n#{result.to_xml(@options[:debug_fields])}\n" if @options[:debug]
147
147
  end
148
148
  end
149
149
 
@@ -163,14 +163,12 @@ module Mabmapper
163
163
  tarReader.each do |entry|
164
164
  if entry.file?
165
165
  log "Processing file #{entry.full_name} from archive #{file}"
166
- result = @engine.process(entry.full_name, entry.read, archive: file)
166
+ result = @transformation.call(entry.read.force_encoding("utf-8"))
167
167
 
168
- xml_result = result.to_xml
168
+ xml_result = hash_to_xml(result)
169
169
  writer.add_file_simple(entry.full_name, 0644, xml_result.bytesize) do |f|
170
170
  f.write(xml_result)
171
171
  end if writer
172
-
173
- log "Result for #{entry.full_name} from archive #{file}\n#{result.to_xml(@options[:debug_fields])}\n" if @options[:debug]
174
172
  end
175
173
  end
176
174
  ensure
@@ -181,14 +179,12 @@ module Mabmapper
181
179
 
182
180
  def process_default_file(file)
183
181
  log "Processing file #{file}"
184
- result = @engine.process(file, File.open(file, "r").read)
182
+ result = @transformation.call(File.open(file, "r").read.force_encoding("utf-8"))
185
183
 
186
184
  if output_dir
187
185
  out_file = File.join(output_dir, File.basename(file))
188
- File.open(out_file, 'w') { |f| f.write(result.to_xml) }
186
+ File.open(out_file, 'w') { |f| f.write(hash_to_xml(result)) }
189
187
  end
190
-
191
- log "Result for #{file}\n#{result.to_xml(@options[:debug_fields])}\n" if @options[:debug]
192
188
  end
193
189
 
194
190
  def output_dir
@@ -210,6 +206,5 @@ module Mabmapper
210
206
  end
211
207
  end
212
208
  end
213
-
214
209
  end
215
210
  end
@@ -84,7 +84,7 @@ module Mabmapper
84
84
  @engine = eval("self", block.binding)
85
85
  end
86
86
 
87
- attr_reader :name, :result, :doc
87
+ attr_reader :name, :result, :doc, :source
88
88
 
89
89
  def ref(field_name)
90
90
  field = @engine.fields.find{ |f| f.name == field_name.to_s }
@@ -107,6 +107,7 @@ module Mabmapper
107
107
 
108
108
  def process(document)
109
109
  @doc = document
110
+ @source = document.source
110
111
  @result = instance_eval(&@proc)
111
112
  end
112
113
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ require 'metacrunch/mab2'
2
3
  require 'stringex'
3
4
 
4
5
  module Mabmapper
@@ -6,11 +7,12 @@ module Mabmapper
6
7
  class Document
7
8
  include QueryHelper
8
9
 
9
- attr_accessor :xml
10
+ attr_accessor :source, :xml
10
11
 
11
12
  def initialize(contents)
12
- @xml = Nokogiri::XML(contents)
13
- @xml.remove_namespaces!
13
+ @source = Metacrunch::Mab2::Document.from_aleph_mab_xml(contents)
14
+ #@xml = Nokogiri::XML(contents)
15
+ #@xml.remove_namespaces!
14
16
  end
15
17
 
16
18
  #
@@ -1,3 +1,3 @@
1
1
  module Mabmapper
2
- VERSION = "1.0.0.pre18"
2
+ VERSION = "2.0.0"
3
3
  end
data/lib/mabmapper.rb CHANGED
@@ -1,6 +1,7 @@
1
+ require 'active_support'
2
+ require 'active_support/core_ext'
1
3
  require 'rubygems/package'
2
4
  require 'optparse'
3
- require 'active_support/core_ext'
4
5
  require 'nokogiri'
5
6
 
6
7
  begin
data/mabmapper.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
  require File.expand_path('../lib/mabmapper/version', __FILE__)
3
3
 
4
4
  Gem::Specification.new do |gem|
5
- gem.authors = ["René Sprotte"]
5
+ gem.authors = ["René Sprotte", "Michael Sievers"]
6
6
  gem.email = ["r.sprotte@ub.uni-paderborn.de"]
7
7
  gem.description = %q{Mabmapper is a powerful and extendable processing engine to normalize any kind of
8
8
  input data into a simple intermediate format made of fields and values. It comes with a ready to use
@@ -21,14 +21,10 @@ Gem::Specification.new do |gem|
21
21
 
22
22
  gem.required_ruby_version = '>= 1.9.3'
23
23
 
24
- gem.add_dependency('nokogiri', '~> 1.6.0')
25
- gem.add_dependency('activesupport', '>= 4.0.0')
26
- gem.add_dependency('libxml-ruby', '~> 2.7.0')
27
- gem.add_dependency('oj', '~> 2.1.4')
28
- gem.add_dependency('stringex', '~> 2.1.0')
29
-
30
- gem.add_development_dependency('minitest', '~> 4.7.5')
31
- gem.add_development_dependency('pry', '0.9.12.2') # stuck to 0.9.12.2 due to repl color issue
32
- gem.add_development_dependency('pry-nav', '~> 0.2.3')
33
- gem.add_development_dependency('pry-syntax-hacks', '~> 0.0.6')
24
+ gem.add_dependency('nokogiri', '~> 1.6')
25
+ gem.add_dependency('activesupport', '>= 4.0')
26
+ gem.add_dependency('libxml-ruby', '~> 2.7')
27
+ gem.add_dependency('metacrunch-mab2')
28
+ gem.add_dependency('oj', '~> 2.1')
29
+ gem.add_dependency('stringex', '~> 2.1')
34
30
  end
@@ -0,0 +1,197 @@
1
+ fieldname = "publisher"
2
+
3
+ describe "field :#{fieldname}" do
4
+ # RAK
5
+ #
6
+ # f410 [NW]
7
+ # a Ort des 1. Verlegers [WDH]
8
+ #
9
+ # f412 [NW]
10
+ # a Name des 1. Verlegers [WDH]
11
+ #
12
+ # f415 [NW]
13
+ # a Ort des 2. Verlegers
14
+ #
15
+ # f417 [NW]
16
+ # a Name des 2. Verlegers
17
+ #
18
+ # RDA
19
+ #
20
+ # f419 [WDH]
21
+ # a Ort(e) (ggf. mehrere durch ; getrennt)
22
+ # b Verlagsname
23
+ # c Datumsangabe
24
+
25
+ context "one publisher, one place" do
26
+ context "for a RAK record" do
27
+ subject do
28
+ mab_xml = mab_xml_builder do
29
+ datafield("410", ind2: "1") { subfield("a", "First publisher place") }
30
+ datafield("412", ind2: "1") { subfield("a", "First publisher") }
31
+ end
32
+
33
+ transform(mab_xml)[fieldname]
34
+ end
35
+
36
+ it { is_expected.to eq(["First publisher place : First publisher"]) }
37
+ end
38
+
39
+ context "for a RDA record" do
40
+ subject do
41
+ mab_xml = mab_xml_builder do
42
+ datafield("419") do
43
+ subfield("a", "First publisher place")
44
+ subfield("b", "First publisher")
45
+ end
46
+ end
47
+
48
+ transform(mab_xml)[fieldname]
49
+ end
50
+
51
+ it { is_expected.to eq(["First publisher place : First publisher"]) }
52
+ end
53
+ end
54
+
55
+ context "one publisher, multiple places" do
56
+ context "for a RAK record" do
57
+ subject do
58
+ mab_xml = mab_xml_builder do
59
+ datafield("410", ind2: "1") { subfield("a", "First publisher place") }
60
+ datafield("410", ind2: "1") { subfield("a", "Another place") }
61
+ datafield("412", ind2: "1") { subfield("a", "First publisher") }
62
+
63
+ datafield("415", ind2: "1") { subfield("a", "Second publisher place") }
64
+ end
65
+
66
+ transform(mab_xml)[fieldname]
67
+ end
68
+
69
+ it { is_expected.to eq(["First publisher place : First publisher"]) }
70
+ end
71
+
72
+ context "for a RDA record" do
73
+ subject do
74
+ mab_xml = mab_xml_builder do
75
+ datafield("419") do
76
+ subfield("a", "First publisher place")
77
+ subfield("a", "First publisher second place")
78
+ subfield("b", "First publisher")
79
+ end
80
+
81
+ datafield("419") do
82
+ subfield("a", "Another place")
83
+ end
84
+ end
85
+
86
+ transform(mab_xml)[fieldname]
87
+ end
88
+
89
+ it { is_expected.to eq(["First publisher place : First publisher"]) }
90
+ end
91
+ end
92
+
93
+ context "multiple publishers, multiple places" do
94
+ context "for a RAK record" do
95
+ subject do
96
+ mab_xml = mab_xml_builder do
97
+ datafield("410", ind2: "1") { subfield("a", "First publisher place") }
98
+ datafield("412", ind2: "1") { subfield("a", "First publisher") }
99
+
100
+ datafield("415", ind2: "1") { subfield("a", "Second publisher place") }
101
+ datafield("417", ind2: "1") { subfield("a", "Second publisher") }
102
+ end
103
+
104
+ transform(mab_xml)[fieldname]
105
+ end
106
+
107
+ it { is_expected.to eq(["First publisher place : First publisher", "Second publisher place : Second publisher"]) }
108
+ end
109
+
110
+ context "for a RDA record" do
111
+ subject do
112
+ mab_xml = mab_xml_builder do
113
+ datafield("419") do
114
+ subfield("a", "First publisher place")
115
+ subfield("b", "First publisher")
116
+ end
117
+
118
+ datafield("419") do
119
+ subfield("a", "Second publisher place")
120
+ subfield("b", "Second publisher")
121
+ end
122
+ end
123
+
124
+ transform(mab_xml)[fieldname]
125
+ end
126
+
127
+ it { is_expected.to eq(["First publisher place : First publisher", "Second publisher place : Second publisher"]) }
128
+ end
129
+ end
130
+
131
+ context "multiple publishers, not all having an associated place" do
132
+ context "for a RAK record" do
133
+ subject do
134
+ mab_xml = mab_xml_builder do
135
+ datafield("410", ind2: "1") { subfield("a", "First publisher place") }
136
+ datafield("412", ind2: "1") { subfield("a", "First publisher") }
137
+
138
+ datafield("417", ind2: "1") { subfield("a", "Second publisher") }
139
+ end
140
+
141
+ transform(mab_xml)[fieldname]
142
+ end
143
+
144
+ it { is_expected.to eq(["First publisher place : First publisher", "Second publisher"]) }
145
+ end
146
+
147
+ context "for a RDA record" do
148
+ subject do
149
+ mab_xml = mab_xml_builder do
150
+ datafield("419") do
151
+ subfield("a", "First publisher place")
152
+ subfield("b", "First publisher")
153
+ end
154
+
155
+ datafield("419") do
156
+ subfield("b", "Second publisher")
157
+ end
158
+ end
159
+
160
+ transform(mab_xml)[fieldname]
161
+ end
162
+
163
+ it { is_expected.to eq(["First publisher place : First publisher", "Second publisher"]) }
164
+ end
165
+ end
166
+
167
+ context "multiple publishers, multiple places within repeated subfields (RAK)" do
168
+ subject do
169
+ mab_xml = mab_xml_builder do
170
+ datafield("410", ind2: "1") { subfield("a", "First publisher place") }
171
+ datafield("410", ind2: "1") { subfield("a", "Second publisher place") }
172
+
173
+ datafield("412", ind2: "1") { subfield("a", "First publisher") }
174
+ datafield("412", ind2: "1") { subfield("a", "Second publisher") }
175
+ end
176
+
177
+ transform(mab_xml)[fieldname]
178
+ end
179
+
180
+ it { is_expected.to eq(["First publisher place : First publisher", "Second publisher place : Second publisher"]) }
181
+ end
182
+
183
+ context "multiple places for a single publisher (RDA)" do
184
+ subject do
185
+ mab_xml = mab_xml_builder do
186
+ datafield("419") do
187
+ subfield("a", "First publisher first place ; First publisher second place")
188
+ subfield("b", "First publisher")
189
+ end
190
+ end
191
+
192
+ transform(mab_xml)[fieldname]
193
+ end
194
+
195
+ it { is_expected.to eq(["First publisher first place : First publisher"]) }
196
+ end
197
+ end
@@ -0,0 +1,86 @@
1
+ if ENV["CODECLIMATE_REPO_TOKEN"]
2
+ # report coverage only for latest mri ruby
3
+ if RUBY_ENGINE == "ruby" && RUBY_VERSION >= "2.2.0"
4
+ require "codeclimate-test-reporter"
5
+ CodeClimate::TestReporter.start
6
+ end
7
+ else
8
+ require "simplecov"
9
+ SimpleCov.start
10
+ end
11
+
12
+ require "mabmapper"
13
+ require "mabmapper/aleph_mab_xml_engine"
14
+ require "hashdiff"
15
+ require "nokogiri"
16
+ require "yaml"
17
+
18
+ begin
19
+ require "pry"
20
+ rescue LoadError
21
+ end
22
+
23
+ RSpec.configure do |config|
24
+ # begin --- rspec 3.1 generator
25
+ config.expect_with :rspec do |expectations|
26
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
27
+ end
28
+
29
+ config.mock_with :rspec do |mocks|
30
+ mocks.verify_partial_doubles = true
31
+ end
32
+ # end --- rspec 3.1 generator
33
+ end
34
+
35
+ def asset_dir
36
+ File.expand_path(File.join(File.dirname(__FILE__), "assets"))
37
+ end
38
+
39
+ def mab_xml_builder(identifier="aleph-publish:000000000", &block)
40
+ Nokogiri::XML::Builder.new do
41
+ send(
42
+ :"OAI-PMH",
43
+ "xmlns" => "http://www.openarchives.org/OAI/2.0/",
44
+ "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
45
+ "xsi:schemaLocation" => "http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"
46
+ ) do
47
+ ListRecords do
48
+ record do
49
+ header do |_xml|
50
+ _xml.identifier identifier
51
+ end
52
+ metadata do
53
+ record("xmlns" => "http://www.ddb.de/professionell/mabxml/mabxml-1.xsd") do
54
+ define_singleton_method(:controlfield) do |_tag, _text|
55
+ send(:method_missing, :controlfield, _text, tag: _tag)
56
+ end
57
+
58
+ define_singleton_method(:datafield) do |_tag, _attributes = {}, &_block|
59
+ send(:method_missing, :datafield, {tag: _tag}.merge(_attributes), &_block)
60
+ end
61
+
62
+ define_singleton_method(:subfield) do |_code, _text|
63
+ send(:method_missing, :subfield, _text, code: _code)
64
+ end
65
+
66
+ define_singleton_method(:journal!) do
67
+ controlfield("052", "p")
68
+ end
69
+
70
+ instance_eval(&block)
71
+ end
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
77
+ .to_xml
78
+ end
79
+
80
+ def transform(mab_xml)
81
+ Mabmapper::AlephMabXmlEngine.new.process(nil, mab_xml).to_hash
82
+ end
83
+
84
+ def read_asset(path_to_file)
85
+ File.read(File.expand_path(File.join(asset_dir, path_to_file)))
86
+ end
@@ -8,12 +8,12 @@
8
8
  <metadata>
9
9
  <record xmlns="http://www.ddb.de/professionell/mabxml/mabxml-1.xsd">
10
10
  <datafield tag="405" ind2="1">
11
- <subfield code="a">XXX</subfield>
12
- <subfield code="p">YYY</subfield>
11
+ <subfield code="a">YYY</subfield>
12
+ <subfield code="p">XXX</subfield>
13
13
  </datafield>
14
14
  <datafield tag="405" ind2="2">
15
- <subfield code="a">XXX</subfield>
16
- <subfield code="p">ZZZ</subfield>
15
+ <subfield code="a">ZZZ</subfield>
16
+ <subfield code="p">XXX</subfield>
17
17
  </datafield>
18
18
  </record>
19
19
  </metadata>
@@ -8,20 +8,20 @@
8
8
  <metadata>
9
9
  <record xmlns="http://www.ddb.de/professionell/mabxml/mabxml-1.xsd">
10
10
  <datafield tag="501" ind2="1">
11
- <subfield code="a">AAA</subfield>
12
- <subfield code="p">BBB</subfield>
11
+ <subfield code="a">BBB</subfield>
12
+ <subfield code="p">AAA</subfield>
13
13
  </datafield>
14
14
  <datafield tag="501" ind2="2">
15
- <subfield code="a">XXX</subfield>
16
- <subfield code="p">ZZZ</subfield>
15
+ <subfield code="a">ZZZ</subfield>
16
+ <subfield code="p">XXX</subfield>
17
17
  </datafield>
18
18
  <datafield tag="519" ind2="1">
19
- <subfield code="a">XXX</subfield>
20
- <subfield code="p">YYY</subfield>
19
+ <subfield code="a">YYY</subfield>
20
+ <subfield code="p">XXX</subfield>
21
21
  </datafield>
22
22
  <datafield tag="519" ind2="2">
23
- <subfield code="a">YYY</subfield>
24
- <subfield code="p">ZZZ</subfield>
23
+ <subfield code="a">ZZZ</subfield>
24
+ <subfield code="p">YYY</subfield>
25
25
  </datafield>
26
26
  </record>
27
27
  </metadata>
@@ -8,12 +8,12 @@
8
8
  <metadata>
9
9
  <record xmlns="http://www.ddb.de/professionell/mabxml/mabxml-1.xsd">
10
10
  <datafield tag="522" ind2="1">
11
- <subfield code="a">XXX</subfield>
12
- <subfield code="p">YYY</subfield>
11
+ <subfield code="a">YYY</subfield>
12
+ <subfield code="p">XXX</subfield>
13
13
  </datafield>
14
14
  <datafield tag="522" ind2="2">
15
- <subfield code="a">XXX</subfield>
16
- <subfield code="p">ZZZ</subfield>
15
+ <subfield code="a">ZZZ</subfield>
16
+ <subfield code="p">XXX</subfield>
17
17
  </datafield>
18
18
  </record>
19
19
  </metadata>
@@ -8,12 +8,12 @@
8
8
  <metadata>
9
9
  <record xmlns="http://www.ddb.de/professionell/mabxml/mabxml-1.xsd">
10
10
  <datafield tag="523" ind2="1">
11
- <subfield code="a">XXX</subfield>
12
- <subfield code="p">YYY</subfield>
11
+ <subfield code="a">YYY</subfield>
12
+ <subfield code="p">XXX</subfield>
13
13
  </datafield>
14
14
  <datafield tag="523" ind2="2">
15
- <subfield code="a">XXX</subfield>
16
- <subfield code="p">ZZZ</subfield>
15
+ <subfield code="a">ZZZ</subfield>
16
+ <subfield code="p">XXX</subfield>
17
17
  </datafield>
18
18
  </record>
19
19
  </metadata>
@@ -8,20 +8,20 @@
8
8
  <metadata>
9
9
  <record xmlns="http://www.ddb.de/professionell/mabxml/mabxml-1.xsd">
10
10
  <datafield tag="536" ind2="1">
11
- <subfield code="a">AAA</subfield>
12
- <subfield code="p">BBB</subfield>
11
+ <subfield code="a">BBB</subfield>
12
+ <subfield code="p">AAA</subfield>
13
13
  </datafield>
14
14
  <datafield tag="536" ind2="2">
15
- <subfield code="a">XXX</subfield>
16
- <subfield code="p">ZZZ</subfield>
15
+ <subfield code="a">ZZZ</subfield>
16
+ <subfield code="p">XXX</subfield>
17
17
  </datafield>
18
18
  <datafield tag="537" ind2="1">
19
- <subfield code="a">XXX</subfield>
20
- <subfield code="p">YYY</subfield>
19
+ <subfield code="a">YYY</subfield>
20
+ <subfield code="p">XXX</subfield>
21
21
  </datafield>
22
22
  <datafield tag="537" ind2="2">
23
- <subfield code="a">YYY</subfield>
24
- <subfield code="p">ZZZ</subfield>
23
+ <subfield code="a">ZZZ</subfield>
24
+ <subfield code="p">YYY</subfield>
25
25
  </datafield>
26
26
  </record>
27
27
  </metadata>