mabmapper 1.0.0.pre18 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/mabmapper/cli.rb CHANGED
@@ -3,6 +3,7 @@
3
3
  #
4
4
  require 'mabmapper/elasticsearch_writer'
5
5
  require 'mabmapper/tar_writer'
6
+ require 'metacrunch/ubpb/transformations/mab_to_primo'
6
7
 
7
8
  module Mabmapper
8
9
  class Cli
@@ -12,7 +13,7 @@ module Mabmapper
12
13
  def initialize
13
14
  @options = {}
14
15
  parse_command_line!
15
- load_engine!
16
+ @transformation = Metacrunch::UBPB::Transformations::MabToPrimo.new
16
17
  process_files!
17
18
  end
18
19
 
@@ -35,11 +36,6 @@ module Mabmapper
35
36
  @options[:debug] = true
36
37
  end
37
38
 
38
- @options[:debug_fields] = []
39
- opts.on( '-f', '--debug-fields a,b,c', Array, "If debug mode is on only fields matching the given names will be debugged." ) do |fields|
40
- @options[:debug_fields] = fields
41
- end
42
-
43
39
  @options[:silent] = false
44
40
  opts.on( '-s', '--silent', "Do not output anything on the console" ) do
45
41
  @options[:silent] = true
@@ -74,22 +70,6 @@ module Mabmapper
74
70
  (puts optparse.help ; exit)
75
71
  end
76
72
 
77
- #
78
- # Load normalization engine
79
- #
80
- def load_engine!
81
- begin
82
- engine_file = "mabmapper/aleph_mab_xml_engine" # TODO: Make me configurable
83
- require engine_file
84
- engine_class_name = "#{engine_file}".classify
85
- @engine = engine_class_name.constantize.new
86
- log "#{engine_class_name} loaded!"
87
- rescue LoadError
88
- log "Error loading engine #{engine_file}."
89
- exit 1
90
- end
91
- end
92
-
93
73
  #
94
74
  # Process the input files
95
75
  #
@@ -119,6 +99,28 @@ module Mabmapper
119
99
 
120
100
  private
121
101
 
102
+ def hash_to_xml(hash)
103
+ builder = Nokogiri::XML::Builder.new do |xml|
104
+ xml.document do
105
+ hash.each_pair do |_key, _values|
106
+ if _values.present? || _values == false
107
+ if _values.is_a?(Array)
108
+ #xml.send("#{field.name.downcase.pluralize}_") do
109
+ _values.each do |_value|
110
+ xml.send("#{_key.downcase}_", _value)
111
+ end
112
+ #end
113
+ else
114
+ xml.send("#{_key.downcase}_", _values)
115
+ end
116
+ end
117
+ end
118
+ end
119
+ end
120
+
121
+ builder.to_xml
122
+ end
123
+
122
124
  def process_file(file)
123
125
  case
124
126
  when file.end_with?('.tar') then process_tar_file(file)
@@ -137,13 +139,11 @@ module Mabmapper
137
139
  tarReader.each do |entry|
138
140
  if entry.file?
139
141
  log "Processing file #{entry.full_name} from archive #{file}"
140
- result = @engine.process(entry.full_name, entry.read, archive: file)
142
+ result = @transformation.call(entry.read.force_encoding("utf-8"))
141
143
 
142
144
  writer.add_file(entry.full_name, 0644) do |f|
143
- f.write(result.to_xml)
145
+ f.write(hash_to_xml(result))
144
146
  end if writer
145
-
146
- log "Result for #{entry.full_name} from archive #{file}\n#{result.to_xml(@options[:debug_fields])}\n" if @options[:debug]
147
147
  end
148
148
  end
149
149
 
@@ -163,14 +163,12 @@ module Mabmapper
163
163
  tarReader.each do |entry|
164
164
  if entry.file?
165
165
  log "Processing file #{entry.full_name} from archive #{file}"
166
- result = @engine.process(entry.full_name, entry.read, archive: file)
166
+ result = @transformation.call(entry.read.force_encoding("utf-8"))
167
167
 
168
- xml_result = result.to_xml
168
+ xml_result = hash_to_xml(result)
169
169
  writer.add_file_simple(entry.full_name, 0644, xml_result.bytesize) do |f|
170
170
  f.write(xml_result)
171
171
  end if writer
172
-
173
- log "Result for #{entry.full_name} from archive #{file}\n#{result.to_xml(@options[:debug_fields])}\n" if @options[:debug]
174
172
  end
175
173
  end
176
174
  ensure
@@ -181,14 +179,12 @@ module Mabmapper
181
179
 
182
180
  def process_default_file(file)
183
181
  log "Processing file #{file}"
184
- result = @engine.process(file, File.open(file, "r").read)
182
+ result = @transformation.call(File.open(file, "r").read.force_encoding("utf-8"))
185
183
 
186
184
  if output_dir
187
185
  out_file = File.join(output_dir, File.basename(file))
188
- File.open(out_file, 'w') { |f| f.write(result.to_xml) }
186
+ File.open(out_file, 'w') { |f| f.write(hash_to_xml(result)) }
189
187
  end
190
-
191
- log "Result for #{file}\n#{result.to_xml(@options[:debug_fields])}\n" if @options[:debug]
192
188
  end
193
189
 
194
190
  def output_dir
@@ -210,6 +206,5 @@ module Mabmapper
210
206
  end
211
207
  end
212
208
  end
213
-
214
209
  end
215
210
  end
@@ -84,7 +84,7 @@ module Mabmapper
84
84
  @engine = eval("self", block.binding)
85
85
  end
86
86
 
87
- attr_reader :name, :result, :doc
87
+ attr_reader :name, :result, :doc, :source
88
88
 
89
89
  def ref(field_name)
90
90
  field = @engine.fields.find{ |f| f.name == field_name.to_s }
@@ -107,6 +107,7 @@ module Mabmapper
107
107
 
108
108
  def process(document)
109
109
  @doc = document
110
+ @source = document.source
110
111
  @result = instance_eval(&@proc)
111
112
  end
112
113
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ require 'metacrunch/mab2'
2
3
  require 'stringex'
3
4
 
4
5
  module Mabmapper
@@ -6,11 +7,12 @@ module Mabmapper
6
7
  class Document
7
8
  include QueryHelper
8
9
 
9
- attr_accessor :xml
10
+ attr_accessor :source, :xml
10
11
 
11
12
  def initialize(contents)
12
- @xml = Nokogiri::XML(contents)
13
- @xml.remove_namespaces!
13
+ @source = Metacrunch::Mab2::Document.from_aleph_mab_xml(contents)
14
+ #@xml = Nokogiri::XML(contents)
15
+ #@xml.remove_namespaces!
14
16
  end
15
17
 
16
18
  #
@@ -1,3 +1,3 @@
1
1
  module Mabmapper
2
- VERSION = "1.0.0.pre18"
2
+ VERSION = "2.0.0"
3
3
  end
data/lib/mabmapper.rb CHANGED
@@ -1,6 +1,7 @@
1
+ require 'active_support'
2
+ require 'active_support/core_ext'
1
3
  require 'rubygems/package'
2
4
  require 'optparse'
3
- require 'active_support/core_ext'
4
5
  require 'nokogiri'
5
6
 
6
7
  begin
data/mabmapper.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
  require File.expand_path('../lib/mabmapper/version', __FILE__)
3
3
 
4
4
  Gem::Specification.new do |gem|
5
- gem.authors = ["René Sprotte"]
5
+ gem.authors = ["René Sprotte", "Michael Sievers"]
6
6
  gem.email = ["r.sprotte@ub.uni-paderborn.de"]
7
7
  gem.description = %q{Mabmapper is a powerful and extendable processing engine to normalize any kind of
8
8
  input data into a simple intermediate format made of fields and values. It comes with a ready to use
@@ -21,14 +21,10 @@ Gem::Specification.new do |gem|
21
21
 
22
22
  gem.required_ruby_version = '>= 1.9.3'
23
23
 
24
- gem.add_dependency('nokogiri', '~> 1.6.0')
25
- gem.add_dependency('activesupport', '>= 4.0.0')
26
- gem.add_dependency('libxml-ruby', '~> 2.7.0')
27
- gem.add_dependency('oj', '~> 2.1.4')
28
- gem.add_dependency('stringex', '~> 2.1.0')
29
-
30
- gem.add_development_dependency('minitest', '~> 4.7.5')
31
- gem.add_development_dependency('pry', '0.9.12.2') # stuck to 0.9.12.2 due to repl color issue
32
- gem.add_development_dependency('pry-nav', '~> 0.2.3')
33
- gem.add_development_dependency('pry-syntax-hacks', '~> 0.0.6')
24
+ gem.add_dependency('nokogiri', '~> 1.6')
25
+ gem.add_dependency('activesupport', '>= 4.0')
26
+ gem.add_dependency('libxml-ruby', '~> 2.7')
27
+ gem.add_dependency('metacrunch-mab2')
28
+ gem.add_dependency('oj', '~> 2.1')
29
+ gem.add_dependency('stringex', '~> 2.1')
34
30
  end
@@ -0,0 +1,197 @@
1
+ fieldname = "publisher"
2
+
3
+ describe "field :#{fieldname}" do
4
+ # RAK
5
+ #
6
+ # f410 [NW]
7
+ # a Ort des 1. Verlegers [WDH]
8
+ #
9
+ # f412 [NW]
10
+ # a Name des 1. Verlegers [WDH]
11
+ #
12
+ # f415 [NW]
13
+ # a Ort des 2. Verlegers
14
+ #
15
+ # f417 [NW]
16
+ # a Name des 2. Verlegers
17
+ #
18
+ # RDA
19
+ #
20
+ # f419 [WDH]
21
+ # a Ort(e) (ggf. mehrere durch ; getrennt)
22
+ # b Verlagsname
23
+ # c Datumsangabe
24
+
25
+ context "one publisher, one place" do
26
+ context "for a RAK record" do
27
+ subject do
28
+ mab_xml = mab_xml_builder do
29
+ datafield("410", ind2: "1") { subfield("a", "First publisher place") }
30
+ datafield("412", ind2: "1") { subfield("a", "First publisher") }
31
+ end
32
+
33
+ transform(mab_xml)[fieldname]
34
+ end
35
+
36
+ it { is_expected.to eq(["First publisher place : First publisher"]) }
37
+ end
38
+
39
+ context "for a RDA record" do
40
+ subject do
41
+ mab_xml = mab_xml_builder do
42
+ datafield("419") do
43
+ subfield("a", "First publisher place")
44
+ subfield("b", "First publisher")
45
+ end
46
+ end
47
+
48
+ transform(mab_xml)[fieldname]
49
+ end
50
+
51
+ it { is_expected.to eq(["First publisher place : First publisher"]) }
52
+ end
53
+ end
54
+
55
+ context "one publisher, multiple places" do
56
+ context "for a RAK record" do
57
+ subject do
58
+ mab_xml = mab_xml_builder do
59
+ datafield("410", ind2: "1") { subfield("a", "First publisher place") }
60
+ datafield("410", ind2: "1") { subfield("a", "Another place") }
61
+ datafield("412", ind2: "1") { subfield("a", "First publisher") }
62
+
63
+ datafield("415", ind2: "1") { subfield("a", "Second publisher place") }
64
+ end
65
+
66
+ transform(mab_xml)[fieldname]
67
+ end
68
+
69
+ it { is_expected.to eq(["First publisher place : First publisher"]) }
70
+ end
71
+
72
+ context "for a RDA record" do
73
+ subject do
74
+ mab_xml = mab_xml_builder do
75
+ datafield("419") do
76
+ subfield("a", "First publisher place")
77
+ subfield("a", "First publisher second place")
78
+ subfield("b", "First publisher")
79
+ end
80
+
81
+ datafield("419") do
82
+ subfield("a", "Another place")
83
+ end
84
+ end
85
+
86
+ transform(mab_xml)[fieldname]
87
+ end
88
+
89
+ it { is_expected.to eq(["First publisher place : First publisher"]) }
90
+ end
91
+ end
92
+
93
+ context "multiple publishers, multiple places" do
94
+ context "for a RAK record" do
95
+ subject do
96
+ mab_xml = mab_xml_builder do
97
+ datafield("410", ind2: "1") { subfield("a", "First publisher place") }
98
+ datafield("412", ind2: "1") { subfield("a", "First publisher") }
99
+
100
+ datafield("415", ind2: "1") { subfield("a", "Second publisher place") }
101
+ datafield("417", ind2: "1") { subfield("a", "Second publisher") }
102
+ end
103
+
104
+ transform(mab_xml)[fieldname]
105
+ end
106
+
107
+ it { is_expected.to eq(["First publisher place : First publisher", "Second publisher place : Second publisher"]) }
108
+ end
109
+
110
+ context "for a RDA record" do
111
+ subject do
112
+ mab_xml = mab_xml_builder do
113
+ datafield("419") do
114
+ subfield("a", "First publisher place")
115
+ subfield("b", "First publisher")
116
+ end
117
+
118
+ datafield("419") do
119
+ subfield("a", "Second publisher place")
120
+ subfield("b", "Second publisher")
121
+ end
122
+ end
123
+
124
+ transform(mab_xml)[fieldname]
125
+ end
126
+
127
+ it { is_expected.to eq(["First publisher place : First publisher", "Second publisher place : Second publisher"]) }
128
+ end
129
+ end
130
+
131
+ context "multiple publishers, not all having an associated place" do
132
+ context "for a RAK record" do
133
+ subject do
134
+ mab_xml = mab_xml_builder do
135
+ datafield("410", ind2: "1") { subfield("a", "First publisher place") }
136
+ datafield("412", ind2: "1") { subfield("a", "First publisher") }
137
+
138
+ datafield("417", ind2: "1") { subfield("a", "Second publisher") }
139
+ end
140
+
141
+ transform(mab_xml)[fieldname]
142
+ end
143
+
144
+ it { is_expected.to eq(["First publisher place : First publisher", "Second publisher"]) }
145
+ end
146
+
147
+ context "for a RDA record" do
148
+ subject do
149
+ mab_xml = mab_xml_builder do
150
+ datafield("419") do
151
+ subfield("a", "First publisher place")
152
+ subfield("b", "First publisher")
153
+ end
154
+
155
+ datafield("419") do
156
+ subfield("b", "Second publisher")
157
+ end
158
+ end
159
+
160
+ transform(mab_xml)[fieldname]
161
+ end
162
+
163
+ it { is_expected.to eq(["First publisher place : First publisher", "Second publisher"]) }
164
+ end
165
+ end
166
+
167
+ context "multiple publishers, multiple places within repeated subfields (RAK)" do
168
+ subject do
169
+ mab_xml = mab_xml_builder do
170
+ datafield("410", ind2: "1") { subfield("a", "First publisher place") }
171
+ datafield("410", ind2: "1") { subfield("a", "Second publisher place") }
172
+
173
+ datafield("412", ind2: "1") { subfield("a", "First publisher") }
174
+ datafield("412", ind2: "1") { subfield("a", "Second publisher") }
175
+ end
176
+
177
+ transform(mab_xml)[fieldname]
178
+ end
179
+
180
+ it { is_expected.to eq(["First publisher place : First publisher", "Second publisher place : Second publisher"]) }
181
+ end
182
+
183
+ context "multiple places for a single publisher (RDA)" do
184
+ subject do
185
+ mab_xml = mab_xml_builder do
186
+ datafield("419") do
187
+ subfield("a", "First publisher first place ; First publisher second place")
188
+ subfield("b", "First publisher")
189
+ end
190
+ end
191
+
192
+ transform(mab_xml)[fieldname]
193
+ end
194
+
195
+ it { is_expected.to eq(["First publisher first place : First publisher"]) }
196
+ end
197
+ end
@@ -0,0 +1,86 @@
1
+ if ENV["CODECLIMATE_REPO_TOKEN"]
2
+ # report coverage only for latest mri ruby
3
+ if RUBY_ENGINE == "ruby" && RUBY_VERSION >= "2.2.0"
4
+ require "codeclimate-test-reporter"
5
+ CodeClimate::TestReporter.start
6
+ end
7
+ else
8
+ require "simplecov"
9
+ SimpleCov.start
10
+ end
11
+
12
+ require "mabmapper"
13
+ require "mabmapper/aleph_mab_xml_engine"
14
+ require "hashdiff"
15
+ require "nokogiri"
16
+ require "yaml"
17
+
18
+ begin
19
+ require "pry"
20
+ rescue LoadError
21
+ end
22
+
23
+ RSpec.configure do |config|
24
+ # begin --- rspec 3.1 generator
25
+ config.expect_with :rspec do |expectations|
26
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
27
+ end
28
+
29
+ config.mock_with :rspec do |mocks|
30
+ mocks.verify_partial_doubles = true
31
+ end
32
+ # end --- rspec 3.1 generator
33
+ end
34
+
35
+ def asset_dir
36
+ File.expand_path(File.join(File.dirname(__FILE__), "assets"))
37
+ end
38
+
39
+ def mab_xml_builder(identifier="aleph-publish:000000000", &block)
40
+ Nokogiri::XML::Builder.new do
41
+ send(
42
+ :"OAI-PMH",
43
+ "xmlns" => "http://www.openarchives.org/OAI/2.0/",
44
+ "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
45
+ "xsi:schemaLocation" => "http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"
46
+ ) do
47
+ ListRecords do
48
+ record do
49
+ header do |_xml|
50
+ _xml.identifier identifier
51
+ end
52
+ metadata do
53
+ record("xmlns" => "http://www.ddb.de/professionell/mabxml/mabxml-1.xsd") do
54
+ define_singleton_method(:controlfield) do |_tag, _text|
55
+ send(:method_missing, :controlfield, _text, tag: _tag)
56
+ end
57
+
58
+ define_singleton_method(:datafield) do |_tag, _attributes = {}, &_block|
59
+ send(:method_missing, :datafield, {tag: _tag}.merge(_attributes), &_block)
60
+ end
61
+
62
+ define_singleton_method(:subfield) do |_code, _text|
63
+ send(:method_missing, :subfield, _text, code: _code)
64
+ end
65
+
66
+ define_singleton_method(:journal!) do
67
+ controlfield("052", "p")
68
+ end
69
+
70
+ instance_eval(&block)
71
+ end
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
77
+ .to_xml
78
+ end
79
+
80
+ def transform(mab_xml)
81
+ Mabmapper::AlephMabXmlEngine.new.process(nil, mab_xml).to_hash
82
+ end
83
+
84
+ def read_asset(path_to_file)
85
+ File.read(File.expand_path(File.join(asset_dir, path_to_file)))
86
+ end
@@ -8,12 +8,12 @@
8
8
  <metadata>
9
9
  <record xmlns="http://www.ddb.de/professionell/mabxml/mabxml-1.xsd">
10
10
  <datafield tag="405" ind2="1">
11
- <subfield code="a">XXX</subfield>
12
- <subfield code="p">YYY</subfield>
11
+ <subfield code="a">YYY</subfield>
12
+ <subfield code="p">XXX</subfield>
13
13
  </datafield>
14
14
  <datafield tag="405" ind2="2">
15
- <subfield code="a">XXX</subfield>
16
- <subfield code="p">ZZZ</subfield>
15
+ <subfield code="a">ZZZ</subfield>
16
+ <subfield code="p">XXX</subfield>
17
17
  </datafield>
18
18
  </record>
19
19
  </metadata>
@@ -8,20 +8,20 @@
8
8
  <metadata>
9
9
  <record xmlns="http://www.ddb.de/professionell/mabxml/mabxml-1.xsd">
10
10
  <datafield tag="501" ind2="1">
11
- <subfield code="a">AAA</subfield>
12
- <subfield code="p">BBB</subfield>
11
+ <subfield code="a">BBB</subfield>
12
+ <subfield code="p">AAA</subfield>
13
13
  </datafield>
14
14
  <datafield tag="501" ind2="2">
15
- <subfield code="a">XXX</subfield>
16
- <subfield code="p">ZZZ</subfield>
15
+ <subfield code="a">ZZZ</subfield>
16
+ <subfield code="p">XXX</subfield>
17
17
  </datafield>
18
18
  <datafield tag="519" ind2="1">
19
- <subfield code="a">XXX</subfield>
20
- <subfield code="p">YYY</subfield>
19
+ <subfield code="a">YYY</subfield>
20
+ <subfield code="p">XXX</subfield>
21
21
  </datafield>
22
22
  <datafield tag="519" ind2="2">
23
- <subfield code="a">YYY</subfield>
24
- <subfield code="p">ZZZ</subfield>
23
+ <subfield code="a">ZZZ</subfield>
24
+ <subfield code="p">YYY</subfield>
25
25
  </datafield>
26
26
  </record>
27
27
  </metadata>
@@ -8,12 +8,12 @@
8
8
  <metadata>
9
9
  <record xmlns="http://www.ddb.de/professionell/mabxml/mabxml-1.xsd">
10
10
  <datafield tag="522" ind2="1">
11
- <subfield code="a">XXX</subfield>
12
- <subfield code="p">YYY</subfield>
11
+ <subfield code="a">YYY</subfield>
12
+ <subfield code="p">XXX</subfield>
13
13
  </datafield>
14
14
  <datafield tag="522" ind2="2">
15
- <subfield code="a">XXX</subfield>
16
- <subfield code="p">ZZZ</subfield>
15
+ <subfield code="a">ZZZ</subfield>
16
+ <subfield code="p">XXX</subfield>
17
17
  </datafield>
18
18
  </record>
19
19
  </metadata>
@@ -8,12 +8,12 @@
8
8
  <metadata>
9
9
  <record xmlns="http://www.ddb.de/professionell/mabxml/mabxml-1.xsd">
10
10
  <datafield tag="523" ind2="1">
11
- <subfield code="a">XXX</subfield>
12
- <subfield code="p">YYY</subfield>
11
+ <subfield code="a">YYY</subfield>
12
+ <subfield code="p">XXX</subfield>
13
13
  </datafield>
14
14
  <datafield tag="523" ind2="2">
15
- <subfield code="a">XXX</subfield>
16
- <subfield code="p">ZZZ</subfield>
15
+ <subfield code="a">ZZZ</subfield>
16
+ <subfield code="p">XXX</subfield>
17
17
  </datafield>
18
18
  </record>
19
19
  </metadata>
@@ -8,20 +8,20 @@
8
8
  <metadata>
9
9
  <record xmlns="http://www.ddb.de/professionell/mabxml/mabxml-1.xsd">
10
10
  <datafield tag="536" ind2="1">
11
- <subfield code="a">AAA</subfield>
12
- <subfield code="p">BBB</subfield>
11
+ <subfield code="a">BBB</subfield>
12
+ <subfield code="p">AAA</subfield>
13
13
  </datafield>
14
14
  <datafield tag="536" ind2="2">
15
- <subfield code="a">XXX</subfield>
16
- <subfield code="p">ZZZ</subfield>
15
+ <subfield code="a">ZZZ</subfield>
16
+ <subfield code="p">XXX</subfield>
17
17
  </datafield>
18
18
  <datafield tag="537" ind2="1">
19
- <subfield code="a">XXX</subfield>
20
- <subfield code="p">YYY</subfield>
19
+ <subfield code="a">YYY</subfield>
20
+ <subfield code="p">XXX</subfield>
21
21
  </datafield>
22
22
  <datafield tag="537" ind2="2">
23
- <subfield code="a">YYY</subfield>
24
- <subfield code="p">ZZZ</subfield>
23
+ <subfield code="a">ZZZ</subfield>
24
+ <subfield code="p">YYY</subfield>
25
25
  </datafield>
26
26
  </record>
27
27
  </metadata>