mabmapper 1.0.0.pre15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/.travis.yml +4 -0
  4. data/Gemfile +7 -0
  5. data/LICENSE +22 -0
  6. data/README.md +49 -0
  7. data/Rakefile +29 -0
  8. data/bin/mabmapper +3 -0
  9. data/lib/mabmapper/aleph_mab_xml_engine.rb +1050 -0
  10. data/lib/mabmapper/cli.rb +216 -0
  11. data/lib/mabmapper/elasticsearch_writer.rb +52 -0
  12. data/lib/mabmapper/engine.rb +112 -0
  13. data/lib/mabmapper/mab_xml/document.rb +53 -0
  14. data/lib/mabmapper/mab_xml/field.rb +43 -0
  15. data/lib/mabmapper/mab_xml/query.rb +25 -0
  16. data/lib/mabmapper/mab_xml/query_helper.rb +101 -0
  17. data/lib/mabmapper/mab_xml/result_set.rb +34 -0
  18. data/lib/mabmapper/mab_xml/subfield.rb +12 -0
  19. data/lib/mabmapper/mab_xml.rb +6 -0
  20. data/lib/mabmapper/tar_writer.rb +29 -0
  21. data/lib/mabmapper/version.rb +3 -0
  22. data/lib/mabmapper.rb +11 -0
  23. data/mabmapper.gemspec +33 -0
  24. data/test/mab_files/test_creation_date/test1.xml +17 -0
  25. data/test/mab_files/test_creation_date/test2.xml +17 -0
  26. data/test/mab_files/test_creationdate/425_a_1.xml +17 -0
  27. data/test/mab_files/test_creationdate/425_a_2.xml +19 -0
  28. data/test/mab_files/test_creationdate/425_bc_1.xml +19 -0
  29. data/test/mab_files/test_creationdate/425_bc_2.xml +22 -0
  30. data/test/mab_files/test_creationdate/425_bc_3.xml +22 -0
  31. data/test/mab_files/test_creationdate/425_bc_4.xml +19 -0
  32. data/test/mab_files/test_creationdate/425_p_1.xml +19 -0
  33. data/test/mab_files/test_creationdate/425_p_2.xml +17 -0
  34. data/test/mab_files/test_creationdate/595_1.xml +20 -0
  35. data/test/mab_files/test_creator_contributor_facet/PAD01.001006945.PRIMO.xml +574 -0
  36. data/test/mab_files/test_description/405.xml +22 -0
  37. data/test/mab_files/test_description/501-519.xml +30 -0
  38. data/test/mab_files/test_description/522.xml +22 -0
  39. data/test/mab_files/test_description/523.xml +22 -0
  40. data/test/mab_files/test_description/536-537.xml +30 -0
  41. data/test/mab_files/test_doc/PAD01.001510737.PRIMO.xml +317 -0
  42. data/test/mab_files/test_edition/PAD01.000844686.PRIMO.xml +584 -0
  43. data/test/mab_files/test_edition/PAD01.000969531.PRIMO.xml +129 -0
  44. data/test/mab_files/test_edition/PAD01.000969710.PRIMO.xml +144 -0
  45. data/test/mab_files/test_edition/PAD01.000978033.PRIMO.xml +163 -0
  46. data/test/mab_files/test_edition/PAD01.000990520.PRIMO.xml +163 -0
  47. data/test/mab_files/test_erscheinungsform/PAD01.000870753.PRIMO.xml +256 -0
  48. data/test/mab_files/test_erscheinungsform/PAD01.000870755.PRIMO.xml +467 -0
  49. data/test/mab_files/test_ht_number/PAD01.001015067.PRIMO.xml +137 -0
  50. data/test/mab_files/test_inhaltstyp/PAD01.000870753.PRIMO.xml +256 -0
  51. data/test/mab_files/test_inhaltstyp/PAD01.000870755.PRIMO.xml +467 -0
  52. data/test/mab_files/test_is_secondary_form/PAD01.000806191.PRIMO.xml +216 -0
  53. data/test/mab_files/test_is_secondary_form/PAD01.000844686.PRIMO.xml +584 -0
  54. data/test/mab_files/test_is_secondary_form/PAD01.001015067.PRIMO.xml +137 -0
  55. data/test/mab_files/test_is_secondary_form/PAD01.001452439.PRIMO.xml +377 -0
  56. data/test/mab_files/test_is_suborder/PAD01.000806191.PRIMO.xml +216 -0
  57. data/test/mab_files/test_is_suborder/PAD01.000844686.PRIMO.xml +584 -0
  58. data/test/mab_files/test_is_suborder/PAD01.001452439.PRIMO.xml +377 -0
  59. data/test/mab_files/test_issn/PAD01.000637121.PRIMO.xml +805 -0
  60. data/test/mab_files/test_materialtyp/PAD01.000870753.PRIMO.xml +256 -0
  61. data/test/mab_files/test_materialtyp/PAD01.000870755.PRIMO.xml +467 -0
  62. data/test/mab_files/test_notation_sort/PAD01.000970649.PRIMO.xml +306 -0
  63. data/test/mab_files/test_notation_sort/PAD01.001006944.PRIMO.xml +279 -0
  64. data/test/mab_files/test_publisher/PAD01.000312406.PRIMO.xml +1043 -0
  65. data/test/mab_files/test_redactional_remark/PAD01.001510737.PRIMO.xml +317 -0
  66. data/test/mab_files/test_relation/PAD01.000438377.PRIMO.xml +232 -0
  67. data/test/mab_files/test_relation/PAD01.000637121.PRIMO.xml +810 -0
  68. data/test/mab_files/test_relation/PAD01.000806191.PRIMO.xml +216 -0
  69. data/test/mab_files/test_relation/PAD01.000844686.PRIMO.xml +584 -0
  70. data/test/mab_files/test_relation/PAD01.001452439.PRIMO.xml +377 -0
  71. data/test/mab_files/test_secondary_form_creationdate/PAD01.000806191.PRIMO.xml +216 -0
  72. data/test/mab_files/test_secondary_form_creationdate/PAD01.000844686.PRIMO.xml +584 -0
  73. data/test/mab_files/test_secondary_form_creationdate/PAD01.001452439.PRIMO.xml +377 -0
  74. data/test/mab_files/test_secondary_form_isbn/PAD01.000806191.PRIMO.xml +216 -0
  75. data/test/mab_files/test_secondary_form_isbn/PAD01.000844686.PRIMO.xml +584 -0
  76. data/test/mab_files/test_secondary_form_isbn/PAD01.001452439.PRIMO.xml +377 -0
  77. data/test/mab_files/test_secondary_form_physical_description/PAD01.000806191.PRIMO.xml +216 -0
  78. data/test/mab_files/test_secondary_form_physical_description/PAD01.001452439.PRIMO.xml +377 -0
  79. data/test/mab_files/test_secondary_form_preliminary_phrase/PAD01.000806191.PRIMO.xml +216 -0
  80. data/test/mab_files/test_secondary_form_preliminary_phrase/PAD01.001452439.PRIMO.xml +377 -0
  81. data/test/mab_files/test_secondary_form_publisher/PAD01.000806191.PRIMO.xml +216 -0
  82. data/test/mab_files/test_secondary_form_publisher/PAD01.001452439.PRIMO.xml +377 -0
  83. data/test/mab_files/test_secondary_form_superorder/PAD01.000806191.PRIMO.xml +216 -0
  84. data/test/mab_files/test_secondary_form_superorder/PAD01.000977734.PRIMO.xml +225 -0
  85. data/test/mab_files/test_secondary_form_superorder/PAD01.001452439.PRIMO.xml +377 -0
  86. data/test/mab_files/test_short_title_display/PAD01.000057960.PRIMO.xml +1069 -0
  87. data/test/mab_files/test_short_title_display/PAD01.000058000.PRIMO.xml +995 -0
  88. data/test/mab_files/test_short_title_display/PAD01.000215104.PRIMO.xml +191 -0
  89. data/test/mab_files/test_short_title_display/PAD01.000310864.PRIMO.xml +999 -0
  90. data/test/mab_files/test_short_title_display/PAD01.000392641.PRIMO.xml +4334 -0
  91. data/test/mab_files/test_short_title_display/PAD01.000392645.PRIMO.xml +4094 -0
  92. data/test/mab_files/test_short_title_display/PAD01.000438377.PRIMO.xml +232 -0
  93. data/test/mab_files/test_short_title_display/PAD01.000479391.PRIMO.xml +142 -0
  94. data/test/mab_files/test_short_title_display/PAD01.000637121.PRIMO.xml +805 -0
  95. data/test/mab_files/test_short_title_display/PAD01.000676616.PRIMO.xml +128 -0
  96. data/test/mab_files/test_short_title_display/PAD01.000782994.PRIMO.xml +169 -0
  97. data/test/mab_files/test_short_title_display/PAD01.001006945.PRIMO.xml +574 -0
  98. data/test/mab_files/test_short_title_display/PAD01.001015067.PRIMO.xml +137 -0
  99. data/test/mab_files/test_short_title_display/PAD01.001015070.PRIMO.xml +212 -0
  100. data/test/mab_files/test_short_title_display/PAD01.001108212.PRIMO.xml +259 -0
  101. data/test/mab_files/test_short_title_display/PAD01.001249043.PRIMO.xml +172 -0
  102. data/test/mab_files/test_short_title_display/PAD01.001499877.PRIMO.xml +227 -0
  103. data/test/mab_files/test_short_title_display/PAD01.001499879.PRIMO.xml +255 -0
  104. data/test/mab_files/test_short_title_display/PAD01.001499880.PRIMO.xml +279 -0
  105. data/test/mab_files/test_short_title_display/PAD01.001510878.PRIMO.xml +184 -0
  106. data/test/mab_files/test_short_title_display/PAD01.001562173.PRIMO.xml +116 -0
  107. data/test/mab_files/test_short_title_display/PAD01.001568334.PRIMO.xml +1840 -0
  108. data/test/mab_files/test_short_title_display/PAD01.001572048.PRIMO.xml +68 -0
  109. data/test/mab_files/test_short_title_display/PAD01.001572049.PRIMO.xml +133 -0
  110. data/test/mab_files/test_signature/PAD01.000161445.PRIMO.xml +149 -0
  111. data/test/mab_files/test_signature/PAD01.000321365.PRIMO.xml +343 -0
  112. data/test/mab_files/test_signature/PAD01.000636652.PRIMO.xml +217 -0
  113. data/test/mab_files/test_signature/PAD01.000857994.PRIMO.xml +187 -0
  114. data/test/mab_files/test_signature/PAD01.000859176.PRIMO.xml +559 -0
  115. data/test/mab_files/test_signature/PAD01.000969442.PRIMO.xml +210 -0
  116. data/test/mab_files/test_signature/PAD01.001006945.PRIMO.xml +574 -0
  117. data/test/mab_files/test_signature_search/PAD01.000161445.PRIMO.xml +149 -0
  118. data/test/mab_files/test_signature_search/PAD01.000321365.PRIMO.xml +343 -0
  119. data/test/mab_files/test_signature_search/PAD01.000636652.PRIMO.xml +217 -0
  120. data/test/mab_files/test_signature_search/PAD01.000857994.PRIMO.xml +187 -0
  121. data/test/mab_files/test_signature_search/PAD01.000859176.PRIMO.xml +559 -0
  122. data/test/mab_files/test_signature_search/PAD01.000969442.PRIMO.xml +210 -0
  123. data/test/mab_files/test_signature_search/PAD01.001006945.PRIMO.xml +574 -0
  124. data/test/mab_files/test_status/PAD01.000321365.PRIMO.xml +343 -0
  125. data/test/mab_files/test_status/PAD01.000392641.PRIMO.xml +4337 -0
  126. data/test/mab_files/test_status/detmold_1.xml +17 -0
  127. data/test/mab_files/test_status/detmold_2.xml +17 -0
  128. data/test/mab_files/test_status/detmold_3.xml +12 -0
  129. data/test/mab_files/test_subject/PAD01.000972511.PRIMO.xml +406 -0
  130. data/test/mab_files/test_suborders/PAD01.000057960.PRIMO.xml +1069 -0
  131. data/test/mab_files/test_suborders/PAD01.000058000.PRIMO.xml +995 -0
  132. data/test/mab_files/test_suborders/PAD01.000215104.PRIMO.xml +191 -0
  133. data/test/mab_files/test_suborders/PAD01.000310864.PRIMO.xml +999 -0
  134. data/test/mab_files/test_suborders/PAD01.000392641.PRIMO.xml +4334 -0
  135. data/test/mab_files/test_suborders/PAD01.000392645.PRIMO.xml +4094 -0
  136. data/test/mab_files/test_suborders/PAD01.000438377.PRIMO.xml +232 -0
  137. data/test/mab_files/test_suborders/PAD01.000479391.PRIMO.xml +142 -0
  138. data/test/mab_files/test_suborders/PAD01.000637121.PRIMO.xml +805 -0
  139. data/test/mab_files/test_suborders/PAD01.000676616.PRIMO.xml +128 -0
  140. data/test/mab_files/test_suborders/PAD01.001006945.PRIMO.xml +574 -0
  141. data/test/mab_files/test_suborders/PAD01.001015067.PRIMO.xml +137 -0
  142. data/test/mab_files/test_suborders/PAD01.001015068.PRIMO.xml +216 -0
  143. data/test/mab_files/test_suborders/PAD01.001015070.PRIMO.xml +212 -0
  144. data/test/mab_files/test_suborders/PAD01.001108212.PRIMO.xml +259 -0
  145. data/test/mab_files/test_suborders/PAD01.001499877.PRIMO.xml +227 -0
  146. data/test/mab_files/test_suborders/PAD01.001499879.PRIMO.xml +255 -0
  147. data/test/mab_files/test_suborders/PAD01.001499880.PRIMO.xml +279 -0
  148. data/test/mab_files/test_suborders/PAD01.001562173.PRIMO.xml +116 -0
  149. data/test/mab_files/test_suborders/PAD01.001572048.PRIMO.xml +68 -0
  150. data/test/mab_files/test_suborders/PAD01.001572049.PRIMO.xml +133 -0
  151. data/test/mab_files/test_superorder/PAD01.000806191.PRIMO.xml +216 -0
  152. data/test/mab_files/test_superorder/PAD01.000844686.PRIMO.xml +584 -0
  153. data/test/mab_files/test_superorder/PAD01.001015067.PRIMO.xml +137 -0
  154. data/test/mab_files/test_superorder/PAD01.001452439.PRIMO.xml +377 -0
  155. data/test/mab_files/test_superorder_display/PAD01.000000872.PRIMO.xml +227 -0
  156. data/test/mab_files/test_superorder_display/PAD01.000160412.PRIMO.xml +518 -0
  157. data/test/mab_files/test_superorder_display/PAD01.000162669.PRIMO.xml +198 -0
  158. data/test/mab_files/test_superorder_display/PAD01.000178500.PRIMO.xml +158 -0
  159. data/test/mab_files/test_superorder_display/PAD01.000297043.PRIMO.xml +154 -0
  160. data/test/mab_files/test_superorder_display/PAD01.000562878.PRIMO.xml +1214 -0
  161. data/test/mab_files/test_superorder_display/PAD01.000958473.PRIMO.xml +379 -0
  162. data/test/mab_files/test_superorder_display/PAD01.001006945.PRIMO.xml +574 -0
  163. data/test/mab_files/test_superorders/PAD01.000057960.PRIMO.xml +1069 -0
  164. data/test/mab_files/test_superorders/PAD01.000215104.PRIMO.xml +191 -0
  165. data/test/mab_files/test_superorders/PAD01.000310864.PRIMO.xml +999 -0
  166. data/test/mab_files/test_superorders/PAD01.000392641.PRIMO.xml +4334 -0
  167. data/test/mab_files/test_superorders/PAD01.000438377.PRIMO.xml +232 -0
  168. data/test/mab_files/test_superorders/PAD01.000479391.PRIMO.xml +142 -0
  169. data/test/mab_files/test_superorders/PAD01.000637121.PRIMO.xml +805 -0
  170. data/test/mab_files/test_superorders/PAD01.001015067.PRIMO.xml +137 -0
  171. data/test/mab_files/test_superorders/PAD01.001499877.PRIMO.xml +227 -0
  172. data/test/mab_files/test_superorders/PAD01.001572048.PRIMO.xml +68 -0
  173. data/test/mab_files/test_title_display/PAD01.000954111.PRIMO.xml +162 -0
  174. data/test/mab_files/test_title_display/PAD01.000992332.PRIMO.xml +189 -0
  175. data/test/mab_files/test_title_display/PAD01.001015068.PRIMO.xml +216 -0
  176. data/test/mab_files/test_title_display/PAD01.001499879.PRIMO.xml +255 -0
  177. data/test/mab_files/test_title_search/test_1.xml +20 -0
  178. data/test/mab_files/test_title_sort/PAD01.000954111.PRIMO.xml +162 -0
  179. data/test/mab_files/test_title_sort/PAD01.000992332.PRIMO.xml +189 -0
  180. data/test/mab_files/test_volume_count_sort/PAD01.001015068.PRIMO.xml +216 -0
  181. data/test/mab_files/test_volume_count_sort/PAD01.001499879.PRIMO.xml +255 -0
  182. data/test/mabmapper/test_creation_date.rb +5 -0
  183. data/test/mabmapper/test_creationdate.rb +23 -0
  184. data/test/mabmapper/test_creator_contributor_facet.rb +4 -0
  185. data/test/mabmapper/test_description.rb +9 -0
  186. data/test/mabmapper/test_doc.rb +6 -0
  187. data/test/mabmapper/test_edition.rb +8 -0
  188. data/test/mabmapper/test_erscheinungsform.rb +5 -0
  189. data/test/mabmapper/test_ht_number.rb +4 -0
  190. data/test/mabmapper/test_inhaltstyp.rb +5 -0
  191. data/test/mabmapper/test_is_secondary_form.rb +7 -0
  192. data/test/mabmapper/test_is_suborder.rb +7 -0
  193. data/test/mabmapper/test_issn.rb +4 -0
  194. data/test/mabmapper/test_materialtyp.rb +5 -0
  195. data/test/mabmapper/test_notation_sort.rb +5 -0
  196. data/test/mabmapper/test_publisher.rb +5 -0
  197. data/test/mabmapper/test_redactional_remark.rb +4 -0
  198. data/test/mabmapper/test_relation.rb +16 -0
  199. data/test/mabmapper/test_secondary_form_creationdate.rb +6 -0
  200. data/test/mabmapper/test_secondary_form_isbn.rb +6 -0
  201. data/test/mabmapper/test_secondary_form_physical_description.rb +5 -0
  202. data/test/mabmapper/test_secondary_form_preliminary_phrase.rb +5 -0
  203. data/test/mabmapper/test_secondary_form_publisher.rb +5 -0
  204. data/test/mabmapper/test_secondary_form_superorder.rb +9 -0
  205. data/test/mabmapper/test_short_title_display.rb +27 -0
  206. data/test/mabmapper/test_signature.rb +12 -0
  207. data/test/mabmapper/test_signature_search.rb +12 -0
  208. data/test/mabmapper/test_status.rb +13 -0
  209. data/test/mabmapper/test_subject.rb +5 -0
  210. data/test/mabmapper/test_suborders.rb +192 -0
  211. data/test/mabmapper/test_superorder.rb +7 -0
  212. data/test/mabmapper/test_superorder_display.rb +22 -0
  213. data/test/mabmapper/test_superorders.rb +38 -0
  214. data/test/mabmapper/test_title_display.rb +12 -0
  215. data/test/mabmapper/test_title_search.rb +4 -0
  216. data/test/mabmapper/test_title_sort.rb +6 -0
  217. data/test/mabmapper/test_volume_count_sort.rb +5 -0
  218. data/test/test_helper.rb +53 -0
  219. data/test/test_mabmapper.rb +19 -0
  220. data/utils/mab_by_docid.sh +19 -0
  221. metadata +574 -0
@@ -0,0 +1,216 @@
1
+ #
2
+ # The command line interface class
3
+ #
4
+ require 'rubygems/test_utilities'
5
+ require 'mabmapper/elasticsearch_writer'
6
+ require 'mabmapper/tar_writer'
7
+
8
module Mabmapper
  #
  # Command line front end.
  #
  # Instantiating the class runs the whole program: the command line is
  # parsed, the normalization engine is loaded and every file named on the
  # command line is processed (optionally spread over several forked
  # processes).
  #
  class Cli

    ROOT_DIR = Dir.pwd

    def initialize
      @options = {}
      parse_command_line!
      load_engine!
      process_files!
    end

    protected

    #
    # Parse command line options
    #
    # Fills @options with the defaults and whatever was given on the command
    # line. Prints the usage and exits with a non-zero status when the
    # options cannot be parsed or no input file is left in ARGV.
    #
    def parse_command_line!
      optparse = OptionParser.new do |opts|
        opts.banner = "Usage: mabmapper [options] FILES"

        @options[:output_dir] = nil
        opts.on( '-o', '--output DIR', 'Output directory' ) do |dir|
          @options[:output_dir] = dir
        end

        @options[:debug] = false
        opts.on( '-d', '--debug', "Debug mode on." ) do
          @options[:debug] = true
        end

        @options[:debug_fields] = []
        opts.on( '-f', '--debug-fields a,b,c', Array, "If debug mode is on only fields matching the given names will be debugged." ) do |fields|
          @options[:debug_fields] = fields
        end

        @options[:silent] = false
        opts.on( '-s', '--silent', "Do not output anything on the console" ) do
          @options[:silent] = true
        end

        #@options[:engine] = "mabmapper/aleph_mab_xml"
        #opts.on( '-e', '--engine ENGINE', 'Normalization engine (Default: aleph_mab_xml)' ) do |engine|
        #  @options[:engine] = engine
        #end

        @options[:no_of_procs] = 1
        opts.on( '-n', '--number-of-procs NUM', Integer, "Use NUM parallel procs [Default: 1]" ) do |n|
          @options[:no_of_procs] = n
        end

        @options[:writer] = Mabmapper::TarWriter
        opts.on( '-w', '--writer WRITER', "Use specified writer (elasticsearch|tar) [Default: tar]" ) do |writer|
          @options[:writer] = Mabmapper::ElasticSearchWriter if writer.downcase == 'elasticsearch'
        end

        opts.on( '-h', '--help', 'Display this screen' ) do
          puts opts
          exit
        end
      end
      optparse.parse!

      # Check for required file arguments. A missing argument is a usage
      # error, so signal it to the calling shell with a non-zero status.
      if ARGV.empty?
        puts optparse.help
        exit 1
      end
    rescue OptionParser::ParseError => e
      puts e.message
      puts optparse.help
      exit 1
    end

    #
    # Load normalization engine
    #
    # Requires the engine file, derives the class name from the file name
    # and stores a new engine instance in @engine. Exits with status 1 when
    # the engine file cannot be loaded.
    #
    def load_engine!
      engine_file = "mabmapper/aleph_mab_xml_engine" # TODO: Make me configurable
      require engine_file
      engine_class_name = engine_file.classify
      @engine = engine_class_name.constantize.new
      log "#{engine_class_name} loaded!"
    rescue LoadError
      log "Error loading engine #{engine_file}."
      exit 1
    end

    #
    # Process the input files
    #
    # Splits ARGV into at most :no_of_procs consecutive chunks, forks one
    # child per chunk and waits for all children to finish.
    #
    def process_files!
      # A process count below 1 would otherwise produce no job at all and
      # silently skip every input file.
      max_processes = [@options[:no_of_procs].to_i, 1].max

      # Prepare joblists for each process. each_slice never yields an empty
      # chunk, so no child is forked without work.
      step    = [(ARGV.size / max_processes.to_f).ceil, 1].max
      joblist = ARGV.each_slice(step).to_a

      # Run joblist
      joblist.each_with_index do |list, index|
        fork do
          Thread.current[:name] = "Process #{index}"
          list.each { |file| process_file(file) }
        end
      end

      Process.waitall

      log "FINISHED"
    end

    private

    #
    # Dispatch a single input file to the matching handler, based on its
    # extension. Everything that is not a (gzipped) tar archive is treated
    # as a plain input file.
    #
    def process_file(file)
      if file.end_with?('.tar')
        process_tar_file(file)
      elsif file.end_with?('.tar.gz', '.tgz')
        process_tar_gz_file(file)
      else
        process_default_file(file)
      end
    end

    #
    # Run the engine over every regular file of an uncompressed tar archive.
    # When an output directory is configured the results are handed to the
    # configured writer.
    #
    def process_tar_file(file)
      out_io = nil
      writer = nil

      if output_dir
        out_file = @options[:writer].out_file(output_dir, File.basename(file))
        out_io   = File.open(out_file, 'wb')
        writer   = @options[:writer].new(out_io)
      end

      # Block form so the archive handle is closed even if the engine raises.
      File.open(file, 'rb') do |tar_io|
        Gem::Package::TarReader.new(tar_io).each do |entry|
          next unless entry.file?

          log "Processing file #{entry.full_name} from archive #{file}"
          result = @engine.process(entry.full_name, entry.read, archive: file)

          if writer
            xml_result = result.to_xml
            writer.add_file(entry.full_name, 0644) { |f| f.write(xml_result) }
          end

          log "Result for #{entry.full_name} from archive #{file}\n#{result.to_xml(@options[:debug_fields])}\n" if @options[:debug]
        end
      end
    ensure
      writer.close if writer
      # Not every writer closes the IO it was given, so close it here.
      out_io.close if out_io && !out_io.closed?
    end

    #
    # Run the engine over every regular file of a gzipped tar archive. When
    # an output directory is configured the results are handed to the
    # configured writer through a gzip stream.
    #
    def process_tar_gz_file(file)
      file_io     = nil
      gzip_writer = nil
      writer      = nil

      if output_dir
        out_file    = @options[:writer].out_file(output_dir, File.basename(file), will_be_gziped: true)
        file_io     = File.open(out_file, 'wb')
        gzip_writer = Zlib::GzipWriter.new(file_io)
        writer      = @options[:writer].new(gzip_writer)
      end

      # Block form so the gzip reader (and its file) is always closed.
      Zlib::GzipReader.open(file) do |gzip_reader|
        Gem::Package::TarReader.new(gzip_reader).each do |entry|
          next unless entry.file?

          log "Processing file #{entry.full_name} from archive #{file}"
          result = @engine.process(entry.full_name, entry.read, archive: file)

          if writer
            xml_result = result.to_xml
            writer.add_file_simple(entry.full_name, 0644, xml_result.bytesize) { |f| f.write(xml_result) }
          end

          log "Result for #{entry.full_name} from archive #{file}\n#{result.to_xml(@options[:debug_fields])}\n" if @options[:debug]
        end
      end
    ensure
      writer.close if writer
      gzip_writer.close if gzip_writer && !gzip_writer.closed?
      file_io.close if file_io && !file_io.closed?
    end

    #
    # Run the engine over a single plain file. When an output directory is
    # configured the result is written there under the same base name.
    #
    def process_default_file(file)
      log "Processing file #{file}"
      # File.read closes the handle again, File.open(...).read does not.
      result = @engine.process(file, File.read(file))

      if output_dir
        out_file = File.join(output_dir, File.basename(file))
        File.open(out_file, 'w') { |f| f.write(result.to_xml) }
      end

      log "Result for #{file}\n#{result.to_xml(@options[:debug_fields])}\n" if @options[:debug]
    end

    #
    # Returns the absolute path of the configured output directory, or nil
    # when none was given. Raises a RuntimeError if the directory is missing.
    #
    def output_dir
      return unless @options[:output_dir]

      dir = File.expand_path(@options[:output_dir])
      # Dir.exists? is gone in current Rubies; Dir.exist? works everywhere.
      raise "No such dir #{dir}" unless Dir.exist?(dir)
      dir
    end

    #
    # Print a message to stdout unless silent mode is on. The message is
    # prefixed with the name stored in Thread.current[:name] when one is set.
    #
    def log(message)
      return if @options[:silent]

      current_thread_name = Thread.current[:name]
      if current_thread_name.nil? || current_thread_name.to_s.strip.empty?
        puts "#{message}"
      else
        puts "[#{current_thread_name}] #{message}"
      end
    end

  end
end
@@ -0,0 +1,52 @@
1
+ require 'active_support/core_ext'
2
+ require 'libxml'
3
+ require 'oj'
4
+
5
+ #
6
+ # (compressed) bulk files can be posted using curl, e.g. with the following command
7
+ #
8
+ # gzip -c -d aleph.PRIMO.20120908.091506.1.es_bulk.gz | curl -XPOST 'localhost:9200/catalog/record/_bulk' --data-binary @-
9
+ #
10
module Mabmapper
  #
  # Writer that turns serialized XML documents into ElasticSearch bulk
  # import lines. It offers the same methods as the tar writer (out_file,
  # add_file, add_file_simple, close) so both can be used interchangeably.
  #
  class ElasticSearchWriter

    #
    # Build the output file name for an input archive: a trailing
    # .tar.gz/.tgz/.tar extension is replaced by .es_bulk, and .gz is
    # appended when options[:will_be_gziped] is set.
    #
    def self.out_file(output_dir_name, file_name, options = {})
      # Anchored and with escaped dots, so only a real archive extension at
      # the end of the name is replaced (e.g. "my.tarball.tar" keeps its stem).
      file_basename = File.basename(file_name).sub(/\.(?:tar\.gz|tgz|tar)\z/, '.es_bulk')
      file_basename += '.gz' if options[:will_be_gziped]

      File.join(output_dir_name, file_basename)
    end

    def initialize(io)
      @io = io

      # Set libxml as minixml backend to improve performance
      ActiveSupport::XmlMini.backend = 'LibXML'
    end

    # Yields the writer itself; name and mode are accepted for interface
    # compatibility only and are not used.
    def add_file(name, mode) # :yields: io
      yield self
    end

    # Yields the writer itself; name, mode and size are accepted for
    # interface compatibility only and are not used.
    def add_file_simple(name, mode, size) # :yields: io
      yield self
    end

    # Closes the underlying IO unless it is closed already.
    def close
      @io.close unless @io.closed?
    end

    # we assume that data is string serialized xml
    #
    # Writes two JSON lines for the document: the bulk "index" action line
    # carrying the document id, followed by the document itself.
    def write(xml)
      bulk = []
      hash = Hash.from_xml(xml)

      bulk.push(Oj.dump({ index: { _id: "#{hash['document']['id']}" }}, mode: :compat))
      bulk.push(Oj.dump(hash['document'], mode: :compat))

      # Beware, right positions of newlines is vital for elasticsearch bulk import
      @io.write(bulk.join("\n") << "\n")
    end

  end
end
@@ -0,0 +1,112 @@
1
+ # coding: utf-8
2
module Mabmapper
  #
  # Base class of the normalization engines. A concrete engine declares its
  # document class and its fields with the class level DSL (document_class,
  # field); #process then evaluates every field block against one document.
  #
  class Engine

    #
    # Evaluate all declared fields against the given contents.
    #
    # input_filename and options[:archive] are only used for the error
    # report. An error raised by a field block is printed and the remaining
    # fields are still processed. Returns the engine itself; the results are
    # available through #to_xml and #to_hash.
    #
    def process(input_filename, contents, options={archive: nil})
      document_class = self.class.document_class_value
      raise "The engine does't provide a document class. This is required." unless document_class.present?
      document = document_class.new(contents)
      fields.each do |field|
        begin
          field.send(:process, document)
        rescue => e
          puts "----------------------------"
          puts "Error  : #{e.message}"
          puts "Field  : #{field.name}"
          puts "File   : #{input_filename}"
          puts "Archive: #{options[:archive]}" if options[:archive].present?
          puts "----------------------------"
          puts "Backtrace:"
          puts e.backtrace
          puts "----------------------------"
          puts
        end
      end
      self
    end

    # The declared fields; an empty list when the engine declares none.
    def fields
      self.class.fields || []
    end

    #
    # Serialize the field results as an XML document. Every field with a
    # present result (or an explicit false) becomes one element named after
    # the downcased field name; array results yield one element per entry.
    # When only_fields is given, fields whose name is not listed are skipped.
    #
    def to_xml(only_fields = [])
      builder = Nokogiri::XML::Builder.new do |xml|
        xml.document do
          fields.each do |field|
            next unless field.result.present? || field.result == false
            next if only_fields.present? && !only_fields.include?(field.name)

            # The trailing underscore keeps the builder from treating the
            # field name as one of its own methods.
            element = "#{field.name.downcase}_"
            if field.result.is_a?(Array)
              #xml.send("#{field.name.downcase.pluralize}_") do
              field.result.each do |result|
                xml.send(element, result)
              end
              #end
            else
              xml.send(element, field.result)
            end
          end
        end
      end

      builder.to_xml
    end

    # Field results keyed by the downcased field name.
    def to_hash
      hash = {}
      fields.each { |field| hash[field.name.downcase] = field.result }
      hash
    end

    protected

    class << self
      cattr_accessor :document_class_value
      cattr_accessor :fields

      # DSL: declare the class used to wrap the raw input.
      def document_class(value)
        self.document_class_value = value
      end

      # DSL: declare a field; the block computes its value.
      def field(name, &block)
        self.fields ||= []
        self.fields << Field.new(name, &block) if block_given?
      end
    end

    # @see: http://www.dan-manges.com/blog/ruby-dsls-instance-eval-with-delegation
    #
    # A named field of an engine. The block given at declaration time is
    # instance_eval'ed in the context of the field, so it can use #doc, #ref,
    # #merge and #wrap.
    class Field
      def initialize(name, &block)
        @name = name.to_s
        @proc = block
        # The object the block was declared in; #ref looks up sibling
        # fields through it.
        @engine = eval("self", block.binding)
      end

      attr_reader :name, :result, :doc

      # Result of another field of the same engine. Raises a RuntimeError
      # when no field with that name exists.
      def ref(field_name)
        field = @engine.fields.find{ |f| f.name == field_name.to_s }
        # The former message interpolated an undefined name and therefore
        # failed with a NameError instead of this error.
        raise "No such field #{field_name}" if field.nil?
        field.result
      end

      # Join the present parts of value1 and value2 with options[:delimiter];
      # value2 is passed through #wrap first when options[:wrap] is given.
      #
      # NOTE(review): passing only some option keys replaces the whole
      # default hash, so e.g. {wrap: "(@)"} alone leaves the delimiter nil.
      # Left unchanged because engine definitions may rely on it - confirm.
      def merge(value1, value2, options = {delimiter: ' ', wrap: nil})
        v1 = [value1].map(&:presence).compact.join(options[:delimiter])
        v2 = [value2].map(&:presence).compact.join(options[:delimiter])
        v2 = wrap(v2, options[:wrap]) if v2.present? && options[:wrap].present?
        [v1, v2].map(&:presence).compact.join(options[:delimiter])
      end

      # Replace every "@" in pattern with value.
      def wrap(value, pattern)
        # Block form: backslash sequences inside value are inserted
        # literally instead of being read as back-references.
        pattern.gsub("@") { value }
      end

      private

      def process(document)
        @doc = document
        @result = instance_eval(&@proc)
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # coding: utf-8
2
+ require 'stringex'
3
+
4
module Mabmapper
  module MabXml
    #
    # Wraps one MAB XML record. The namespaces are removed after parsing so
    # that plain element names can be used in XPath expressions.
    #
    class Document
      include QueryHelper

      def initialize(contents)
        @xml = Nokogiri::XML(contents)
        @xml.remove_namespaces!
      end

      #
      # Returns the contents of a MAB control field as an array.
      #
      # The array holds one entry per character; the fill character '|' is
      # mapped to nil. A missing or empty control field yields [].
      #
      def controlfield(name)
        xpath = "//controlfield[@tag='#{name}']"
        result = @xml.at_xpath(xpath).try(:text)
        result.present? ? result.chars.to_a.map{|e| (e=='|') ? nil : e} : []
      end

      # Normalize text based on the NACO rules
      #
      # Returns the normalized copy, or nil for a blank value. The given
      # string itself is left untouched.
      def naco_normalization(value)
        return unless value.present?

        # Downcase everything (on a copy: the caller's string must not be
        # modified, and it may be frozen)
        value = value.downcase

        # Convert unicode [and accented ASCII] characters to their
        # plain-text ASCII equivalents
        Stringex::Localization.backend = :internal
        Stringex::Localization.locale = :de
        Stringex::Localization.store_translations(:de, :transliterations, {"ü" => "ue", "ä" => "ae", "ö" => "oe", "ß" => "ss"}) if Stringex::Localization.backend.translations.blank?
        value = value.to_ascii

        # Convert special chars to spaces
        value = value.gsub(/[,$~^%*\/?@.:;<>{}!\(\)\-]/, ' ')

        # Remove brackets and pipes entirely
        value = value.gsub(/[\[\]\|]/, '')

        # Remove leading and trailing spaces
        value = value.strip

        # Condense multiple spaces
        value.gsub(/\s+/, ' ')
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module Mabmapper
  module MabXml
    #
    # A datafield of a query result: a name plus the ordered list of its
    # subfields.
    #
    class Field
      attr_reader :name, :subfields

      def initialize(name)
        @name = name
        @subfields = []
      end

      # Append a subfield. Raises a RuntimeError for anything that is not a
      # Subfield (including nil).
      def add_subfield(subfield)
        raise "Expected a Subfield, got #{subfield.inspect}" unless subfield.is_a?(Subfield)

        @subfields << subfield
      end

      #
      # The subfield values.
      #
      # Without :join_subfields an array is returned. With
      # join_subfields: true the values are concatenated without separator,
      # with any other truthy value they are joined by its string form.
      #
      def values(options = {})
        # Merge into a fresh hash so the caller's options stay untouched.
        options = { join_subfields: nil }.merge(options)
        values = @subfields.map(&:value)

        join = options[:join_subfields]
        return values unless join

        values.join(join == true ? nil : join.to_s)
      end

      # Like #values, but joins the subfield values with a space by default.
      def value(options = {})
        values({ join_subfields: ' ' }.merge(options))
      end

      # True when a subfield with the given name exists (names are compared
      # by their string form).
      def has_subfield?(subfield)
        subfields.any?{|s| s.name.to_s == subfield.to_s}
      end

      # The first subfield with the given name, or nil.
      def get_subfield(subfield)
        subfields.find{|s| s.name.to_s == subfield.to_s}
      end

      def to_s
        value
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module Mabmapper
  module MabXml
    #
    # Collects the field and subfield selectors of one lookup on an XML
    # document. Both add_* methods return the query itself so that calls
    # can be chained.
    #
    class Query
      include QueryHelper

      attr_reader :xml
      attr_reader :fields
      attr_reader :subfields

      def initialize(xml)
        @xml       = xml
        @fields    = []
        @subfields = []
      end

      # Remember a field selector; the indicators default to nil when they
      # are not given in options.
      def add_field(name, options={})
        selector = {name: name, options: options.reverse_merge(ind1: nil, ind2: nil)}
        @fields.push(selector)
        self
      end

      # Remember a subfield selector.
      def add_subfield(name)
        @subfields.push({name: name})
        self
      end
    end
  end
end
@@ -0,0 +1,101 @@
1
module Mabmapper
  module MabXml
    #
    # Small query DSL shared by the document and the query class:
    # field(...) and subfield(...) collect selectors on a Query, #get runs
    # the resulting XPath expressions and returns a ResultSet.
    #
    # A selector value with a leading '-' (and at least one more character)
    # is negated; a lone '-' is an ordinary literal value.
    #
    module QueryHelper

      # Add one field name (String) or several (Enumerable) to the query.
      # options may carry :ind1 / :ind2, each a value or a list of values.
      # Returns the query, or nil for any other argument type.
      def field(name_or_enum, options={})
        if name_or_enum.is_a?(String)
          query.add_field(name_or_enum, options)
        elsif name_or_enum.is_a?(Enumerable)
          name_or_enum.inject(query) { |q, name| q.add_field(name, options) }
        end
      end

      # Add one subfield code (String) or several (Enumerable) to the query.
      # Returns the query, or nil for any other argument type.
      def subfield(name_or_enum)
        if name_or_enum.is_a?(String)
          query.add_subfield(name_or_enum)
        elsif name_or_enum.is_a?(Enumerable)
          name_or_enum.inject(query) { |q, name| q.add_subfield(name) }
        end
      end

      # Run the query: one Field per matching datafield, each holding the
      # matching subfields (blank subfield texts are stored as nil).
      def get
        results = ResultSet.new

        query.xml.xpath(fields_xpath).each do |field_xml|
          result_field = Field.new(field_xml.attribute('tag').to_s)

          field_xml.xpath(subfields_xpath).each do |subfield_xml|
            code  = subfield_xml.attribute('code').to_s
            value = subfield_xml.text.presence

            result_field.add_subfield(Subfield.new(code, value))
          end

          results.add_field(result_field)
        end

        results
      end

      private

      # The query the selectors are collected on: the receiver itself when
      # it already is a Query, otherwise a new Query on the receiver's @xml.
      def query
        self.is_a?(Query) ? self : Query.new(@xml)
      end

      # XPath selecting every datafield that matches one of the field
      # selectors (tag plus optional indicator conditions).
      def fields_xpath
        options = query.fields.map do |f|
          conditions = ["@tag='#{f[:name]}'"]
          conditions << indicator_xpath("ind1", f[:options][:ind1])
          conditions << indicator_xpath("ind2", f[:options][:ind2])

          "(#{conditions.compact.join(' and ')})"
        end.join(' or ')

        "/OAI-PMH/ListRecords/record/metadata/record/datafield[#{options}]"
      end

      # Relative XPath selecting the requested subfields of a datafield.
      # As soon as one code is negated all conditions are and-ed, otherwise
      # they are or-ed.
      def subfields_xpath
        conditions = attribute_conditions("code", query.subfields.map { |f| f[:name] })

        conditions.empty? ? "subfield" : "subfield[#{conditions}]"
      end

      # Parenthesized XPath condition for one indicator, or nil when no
      # value was given. Accepts a single value or a list of values.
      def indicator_xpath(name, value)
        conditions = attribute_conditions(name, [*value])

        conditions.empty? ? nil : "(#{conditions})"
      end

      # Build the condition list for one attribute from plain and negated
      # ('-' prefixed) values; nil and false entries are ignored.
      def attribute_conditions(attribute, values)
        global_negation = false

        conditions = values.select { |v| v }.map do |v|
          v = v.to_s
          # && (not `and`): the length check has to be part of the assigned
          # value, otherwise a literal '-' would be treated as a negation.
          negation = v.start_with?('-') && v.length > 1
          global_negation = true if negation

          negation ? "not(@#{attribute}='#{v.slice(1..-1)}')" : "@#{attribute}='#{v}'"
        end

        conditions.join(global_negation ? ' and ' : ' or ')
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
module Mabmapper
  module MabXml
    #
    # The ordered, enumerable list of fields returned by a query.
    #
    class ResultSet
      include Enumerable
      attr_reader :fields

      def initialize
        @fields = []
      end

      # Append a field. Raises a RuntimeError for anything that is not a
      # Field (including nil).
      def add_field(field)
        raise "Expected a Field, got #{field.inspect}" unless field.is_a?(Field)

        @fields << field
      end

      # Yields every field in order. Without a block an Enumerator is
      # returned, as for the core collections.
      def each
        return enum_for(:each) unless block_given?

        @fields.each{|i| yield i}
      end

      # The values of every field, one entry per field.
      def values(options = {})
        @fields.map{|f| f.values(options)}
      end

      # The value of the first field, or nil when the set is empty.
      def value(options = {})
        first_field = @fields.first
        first_field && first_field.value(options)
      end

      def to_s
        value
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
module Mabmapper
  module MabXml
    #
    # Read-only pair of a subfield name and its value.
    #
    class Subfield
      attr_reader :name
      attr_reader :value

      def initialize(name, value)
        @name, @value = name, value
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
+ require 'mabmapper/mab_xml/query_helper'
2
+ require 'mabmapper/mab_xml/query'
3
+ require 'mabmapper/mab_xml/result_set'
4
+ require 'mabmapper/mab_xml/field'
5
+ require 'mabmapper/mab_xml/subfield'
6
+ require 'mabmapper/mab_xml/document'
@@ -0,0 +1,29 @@
1
module Mabmapper
  #
  # Thin wrapper around Gem::Package::TarWriter that writes the results
  # into a tar stream on the given IO.
  #
  class TarWriter

    # Path of the output archive: the base name of file_name inside
    # output_dir_name. The options are accepted but not used.
    def self.out_file(output_dir_name, file_name, options = {})
      archive_name = File.basename(file_name)
      File.join(output_dir_name, archive_name)
    end

    def initialize(io)
      @tar_writer = Gem::Package::TarWriter.new(io)
    end

    # Add an entry; the block receives the IO to write the contents to.
    def add_file(name, mode) # :yields: io
      @tar_writer.add_file(name, mode) { |entry_io| yield entry_io }
    end

    # Add an entry of a size known in advance; the block receives the IO
    # to write the contents to.
    def add_file_simple(name, mode, size) # :yields: io
      @tar_writer.add_file_simple(name, mode, size) { |entry_io| yield entry_io }
    end

    # Close the wrapped tar writer.
    def close
      @tar_writer.close
    end

  end
end
@@ -0,0 +1,3 @@
1
module Mabmapper
  # Version of the gem. Frozen so the constant cannot be modified in place.
  VERSION = "1.0.0.pre15".freeze
end