mead 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/lib/mead/ead.rb ADDED
@@ -0,0 +1,215 @@
1
module Mead
  # Loads an EAD XML document and builds one record per did element that
  # holds container elements. Each record is a Hash with the keys :series,
  # :mead, :title and :containers, collected in #containers.
  #
  # A generated mead has the shape "eadid-series-container-folder-001".
  # When the container part cannot be built the mead stops after the series
  # ("eadid-series"); #valid? reports such short meads as invalid.
  #
  # CONTAINER_MAPPING is defined outside this class. From its use here, its
  # keys are the codes prefixed to container numbers and its values are
  # container type names.
  class Ead
    # factor out :baseurl, :file and :url into an options object?
    attr_accessor :containers, :ead, :baseurl, :file, :url, :doc, :eadid, :series_present,
      :c01s_series_answer

    # opts - Hash; recognised keys are :eadid, :file (a readable IO),
    #        :url and :baseurl (combined with :eadid as "baseurl/eadid.xml").
    #
    # Raises RuntimeError when the EAD cannot be located or no eadid is known.
    def initialize(opts={})
      @eadid      = opts[:eadid]
      @file       = opts[:file]
      @baseurl    = opts[:baseurl]
      @url        = opts[:url]
      @containers = []

      get_ead
      find_eadid unless @eadid
      crawl_for_containers
    end

    # Reads the raw XML into @ead and parses it into @doc.
    # Source precedence: :file, then :url, then :baseurl + :eadid.
    #
    # Raises RuntimeError when none of those sources is usable. This includes
    # the case where nothing was supplied at all, which the previous guard
    # let through.
    def get_ead
      if @file.respond_to?(:read)
        # An IO that has already been consumed is read again from the start.
        @file.rewind if @file.respond_to?(:eof?) && @file.eof?
        @ead = @file.read
      elsif @url
        @ead = read_location(@url)
      elsif @baseurl && @eadid
        @ead = read_location(File.join(@baseurl, @eadid + '.xml'))
      else
        raise 'Cannot get EAD based on params.'
      end
      @doc = Nokogiri::XML(@ead)
    end

    # Takes the eadid from the first eadid element of the document.
    # Any failure (no such element, XPath error) becomes a RuntimeError.
    def find_eadid
      @eadid = @doc.xpath('//xmlns:eadid').first.text
    rescue
      raise 'Need an eadid and none has been given and it cannot be found in the EAD XML.'
    end

    # Appends one record to @containers for every did below each c01 that
    # has at least one container element.
    def crawl_for_containers
      series_based = c01s_series?
      c01s.each_with_index do |c01, i|
        dids = c01.xpath('.//xmlns:container').map { |container| container.parent }.uniq
        dids.each do |did|
          info = {}
          # if all the c01s are at the file level the series is recorded as 0
          info[:series]     = series_based ? i + 1 : 0
          info[:mead]       = create_mead(did, i)
          info[:title]      = concat_title(did)
          info[:containers] = text_containers(did)
          @containers << info
        end
      end
    end

    # Returns an Array with one "Type text" String per container of the did;
    # the type prefix is omitted when the container has no type attribute.
    def text_containers(did)
      did.xpath('xmlns:container').map do |container|
        type  = container.attribute('type')
        label = type ? type.text + ' ' : ''
        label + container.text.to_s
      end
    end

    # All c01 elements directly under a dsc.
    def c01s
      @doc.xpath('//xmlns:dsc/xmlns:c01')
    end

    # True when every c01 has level="series". The answer is cached even when
    # it is false, so the two XPath queries run once instead of once per did.
    def c01s_series?
      if @c01s_series_answer.nil?
        @c01s_series_answer = (c01s.length == series_c01s.length)
      end
      @c01s_series_answer
    end

    # The c01 elements directly under a dsc that have level="series".
    def series_c01s
      @doc.xpath("//xmlns:dsc/xmlns:c01[@level='series']")
    end

    # Builds the mead String for a did found in the c01 at zero-based index i.
    # The result is also stored in @mead.
    def create_mead(did, i)
      parts = [@eadid.dup]
      parts << (c01s_series? ? "%03d" % (i + 1) : '001')
      begin
        parts << specific_containers(did)
        parts << '001' # stub for first record
      rescue
        # Deliberate: a did whose containers cannot be encoded yields a short
        # mead, which #valid? later reports through #short_meads?.
      end
      @mead = parts.flatten.join('-')
    end

    # Returns "unittitle[, unitdate][ (unitid)]" for the did; date and id are
    # only added when they are not empty.
    def concat_title(did)
      title    = did.xpath('xmlns:unittitle').text
      unitdate = did.xpath('xmlns:unitdate').text
      unitid   = did.xpath('xmlns:unitid').text
      title = "#{title}, #{unitdate}" unless unitdate.empty?
      title = "#{title} (#{unitid})" unless unitid.empty?
      title
    end

    # Returns the two container segments of a mead for the did.
    # One container  -> [encoded container, '000'].
    # Two containers -> [encoded first, encoded second padded to 3].
    #
    # Raises RuntimeError for zero or more than two containers.
    def specific_containers(did)
      containers = did.xpath('xmlns:container')
      case containers.length
      when 1
        [make_box(containers[0]), '000']
      when 2
        [make_box(containers[0]), make_box(containers[1], 3)]
      when 0
        raise "Do we really have zero containers?!"
      else
        raise "I can't create a mead identifier with more than 2 containers in a did!"
      end
    end

    # Encodes one container as its type code followed by its text, which is
    # right-aligned to +padding+ characters; every space becomes '0',
    # '.' becomes '_' and '-' becomes '~'.
    def make_box(container, padding=4)
      # FIXME: pad based on first part of range for folder +++
      text = container.text.rjust(padding.to_i).gsub(' ', '0').gsub('.', '_').gsub('-', '~')
      container_type(container) + text
    end

    # Returns the CONTAINER_MAPPING key whose value equals the container's
    # type attribute (as written or downcased), or '' when nothing matches.
    # The last matching entry wins. Raises NoMethodError when the container
    # has no type attribute; #create_mead relies on that to emit a short mead.
    def container_type(container)
      type  = container.attribute('type').text
      match = ''
      CONTAINER_MAPPING.each do |code, name|
        match = code if name == type || name == type.downcase
      end
      match
    end

    # CSV export of this document's records; see Mead::Ead.to_csv.
    def to_csv
      Mead::Ead.to_csv(self.containers)
    end

    # Returns a CSV String with the header mead,title,series,containers and
    # one row per record in container_list.
    def self.to_csv(container_list)
      # When the old CSV library (which defines CSV::Reader) is loaded,
      # FasterCSV is used instead; otherwise the CSV from 1.9.
      csv_class = CSV.const_defined?(:Reader) ? FasterCSV : CSV
      csv_class.generate do |csv|
        # FIXME (kept from the original; an earlier variant left out the
        # containers column)
        csv << ['mead','title','series', 'containers']
        container_list.each do |container|
          csv << [container[:mead], container[:title], container[:series], container[:containers].join(', ')]
        end
      end
    end

    # True when every record has a distinct mead and no mead is short.
    def valid?
      unique_meads.length == @containers.length && !short_meads?
    end

    # Distinct meads of all records.
    def unique_meads
      meads.uniq
    end

    # Distinct meads with more than two '-'-separated parts.
    def long_meads
      unique_meads.select { |m| m.split('-').length > 2 }
    end

    # Distinct meads with at most two '-'-separated parts.
    def short_meads
      unique_meads.select { |m| m.split('-').length <= 2 }
    end

    # True when at least one mead is short.
    def short_meads?
      !short_meads.empty?
    end

    # Records whose mead is shared with another record.
    def invalid
      duplicates = dups
      @containers.select { |container| duplicates.include?(container[:mead]) }
    end

    # Sorted Array of meads that occur more than once.
    def dups
      counts = Hash.new(0)
      meads.each { |mead| counts[mead] += 1 }
      counts.reject { |_, count| count == 1 }.keys.sort
    end

    # Meads of all records, in record order, duplicates included.
    def meads
      @containers.map { |container| container[:mead] }
    end

    private

    # Returns the content found at +location+ and closes the handle.
    # URI.open is used when open-uri provides it, because Kernel#open stopped
    # fetching URLs in Ruby 3.0; otherwise Kernel.open is used as before.
    # NOTE(review): both fall back to Kernel#open semantics for non-URL
    # strings, so +location+ should not come from untrusted input.
    def read_location(location)
      opener = (defined?(::URI) && ::URI.respond_to?(:open)) ? ::URI : ::Kernel
      opener.open(location) { |io| io.read }
    end

  end
end
215
+
@@ -0,0 +1,46 @@
1
module Mead
  # Builds a Mead::Ead for every "*.xml" file of a directory and sorts the
  # eadids (file names without ".xml") into valid and invalid ones.
  class EadValidator
    attr_accessor :directory, :valid, :invalid, :invalid_full

    # Creates a new EadValidator when given the path to a directory as a String
    def initialize(directory)
      @directory = directory
      @valid = []
      @invalid = []
      @invalid_full = []
    end

    # Checks every file in its own thread and records the outcome.
    #
    # The threads only compute; the result arrays are filled here, from the
    # calling thread, after each worker has finished. That keeps them free of
    # concurrent mutation.
    #
    # Returns the Hash produced by #metadata.
    def validate!
      paths = Dir.glob(File.join(@directory, '*.xml')).sort
      threads = paths.map do |path|
        Thread.new(path) { |path_t| check_file(path_t) }
      end
      threads.each do |thread|
        # Thread#value joins the thread and returns what its block returned.
        eadid, ead, error, ok = thread.value
        if ok
          @valid << eadid
        else
          record_invalid(eadid, ead, error)
        end
      end
      metadata
    end

    # Records eadid as invalid.
    #
    # ead   - the Mead::Ead that failed validation, or nil when the document
    #         could not even be built (then :dups and :containers are empty).
    # error - the exception raised while building the document, if any.
    def record_invalid(eadid, ead, error=nil)
      @invalid << eadid
      @invalid_full << {
        :eadid      => eadid,
        :error      => error,
        :dups       => ead ? ead.dups : [],
        :containers => ead ? ead.invalid : []
      }
    end

    # Sorted eadids as {:valid => [...], :invalid => [...]}.
    def metadata
      {:valid => @valid.sort, :invalid => @invalid.sort}
    end

    private

    # Builds and validates the document at +path+.
    #
    # Returns [eadid, ead, error, valid]: ead is nil and error is set when
    # Mead::Ead.new (or opening the file) raised a StandardError.
    def check_file(path)
      eadid = File.basename(path, '.xml')
      begin
        # Block form closes the file once the document has been built.
        ead = File.open(path) { |file| Mead::Ead.new({:file => file, :eadid => eadid}) }
      rescue => e
        return [eadid, nil, e, false]
      end
      [eadid, ead, nil, ead.valid?]
    end
  end
end
46
+
@@ -0,0 +1,198 @@
1
module Mead
  # Looks up the did described by a Mead::Identifier in its EAD XML and
  # collects that did plus the dids of its ancestors.
  class Extractor

    attr_accessor :mead, :dsc, :series, :doc, :ead_location, :stack, :node

    # a stack contains the data (unittitle, unitdate) from the identifier's
    # container all the way through to parent containers. The order is from
    # most specific to least specific
    #
    # Raises RuntimeError unless mead is a Mead::Identifier.
    def initialize(mead)
      @stack = []
      get_mead_obj(mead)
    end

    # Reads and parses the EAD, fills the stack and returns it.
    def extract
      get_ead_location
      @doc = Nokogiri::XML(get_eadxml)
      do_extraction
      @mead.metadata = @stack # make sure the metadata always gets cached to the identifier
      @stack
    end

    private

    def get_mead_obj(mead)
      raise "is not a Mead::Identifier" unless mead.is_a? Mead::Identifier
      @mead = mead
    end

    def get_ead_location
      unless mead.ead_location
        raise 'Cannot extract because no Ead location defined in Mead::Identifier.'
      end
      @ead_location = mead.ead_location
    end

    def do_extraction
      get_dsc
      get_series
      find_node
      push_to_stack(@node)
    end

    # Pushes a Hash describing the did +node+ onto the stack, then does the
    # same for the did of the enclosing component, if there is one.
    def push_to_stack(node)
      return nil if node.nil?
      additional_did = {:unittitle => container_field('unittitle', node),
        :unitdate => container_field('unitdate', node),
        :level => container_level(node),
        :unitid => container_field('unitid', node)
      }
      did_location_text = did_location(node)
      additional_did[:item_location] = did_location_text if did_location_text

      add_containers(additional_did, node)

      if additional_did[:level] == 'series'
        additional_did[:series_number] = series_number(node)
      end
      # Stop when this entry repeats the one pushed last.
      return if @stack.last == additional_did
      @stack << additional_did

      # node is a did; its grandparent is the component one level up.
      parent_dids = node.parent.parent.xpath('xmlns:did')
      push_to_stack(parent_dids[0]) unless parent_dids.empty?
    end

    # Adds :containers (an Array of Mead::Container) to hash when the did has
    # container children; leaves hash untouched otherwise.
    def add_containers(hash, node)
      containers = node.xpath('./xmlns:container')
      return if containers.empty?
      hash[:containers] = []
      containers.each do |container|
        c = Mead::Container.new
        c.type = container.attribute('type').text if container.attribute('type')
        c.label = container.attribute('label').text if container.attribute('label')
        c.text = container.text if !container.text.empty?
        hash[:containers] << c
      end
    end

    # Returns "Type text, Type text" for the containers of the did, or nil
    # when it has none. A container without a type attribute contributes its
    # text only (it used to raise NoMethodError).
    def did_location(did)
      location = did.xpath('./xmlns:container').map do |container|
        type = container.attribute('type')
        type ? type.text + ' ' + container.text : container.text
      end
      location.join(', ') unless location.empty?
    end

    # Sets @series to the c01 with level="series" whose 1-based position
    # equals the identifier's series, or to the whole dsc when the document
    # has no such c01. @series is left unchanged when the position matches
    # nothing.
    def get_series
      c01_series = @dsc.xpath(".//xmlns:c01[@level='series']")
      if c01_series and !c01_series.empty?
        position = mead.series.to_i
        found = position > 0 ? c01_series[position - 1] : nil
        @series = found if found
      else
        @series = @dsc
      end
    end

    # XPath predicate matching the folder's type as given or capitalized;
    # type 'folder' also matches envelopes.
    def folder_types
      types = "@type='#{@mead.folder[:type]}' or @type='#{@mead.folder[:type].capitalize}'"
      if @mead.folder[:type] == 'folder'
        types += " or @type='envelope' or @type='Envelope'"
      end
      types
    end

    # Finds the did whose container (and, when folder is true and the
    # identifier has one, folder) matches the identifier; stores it in @node.
    #
    # Returns nil without searching when the identifier has no container type.
    # Raises RuntimeError when no series is available, when several nodes
    # match, or when none matches.
    def find_node(folder=true)
      return nil unless @mead.container[:type]
      raise "no matching series!" if series.nil?

      use_folder = (folder and @mead.folder) ? true : false
      # NOTE(review): identifier values are interpolated into the XPath
      # unescaped; a value containing a single quote would break the query.
      container_set_xpath = ".//xmlns:container[text()='#{@mead.container[:number]}' and (@type='#{@mead.container[:type]}' or @type='#{@mead.container[:type].capitalize}')]"
      if use_folder
        container_set_xpath += "/../xmlns:container[text()='#{@mead.folder[:number]}' and (#{folder_types})]"
      end
      containers = series.xpath(container_set_xpath)

      if containers.length > 1
        raise "too many matching nodes!"
      elsif containers.length == 0
        # Second chance to handle legacy identifiers where a blank folder was given as 001.
        # Only taken while the folder was part of the query: the retry itself
        # must not retry again, and a nil folder must not be dereferenced.
        if use_folder and @mead.folder[:number] == '1'
          find_node(false)
        else
          raise "no matching dids!"
        end
      else
        @node = containers[0].parent
      end
    end

    # Text of the node's child elements named +field+, or nil when empty.
    def container_field(field, node)
      text = node.xpath('xmlns:' + field).text
      (text.nil? or text.empty?) ? nil : text
    end

    # The level attribute of the did's parent, or nil.
    def container_level(node)
      node.parent['level'] || nil
    end

    # 1-based position of the did's parent among all c01 elements of the
    # document, or nil when the parent is not one of them.
    def series_number(node)
      index = node.document.xpath('//xmlns:c01').index(node.parent)
      index && index + 1
    end

    def get_dsc
      @dsc = @doc.xpath('//xmlns:dsc')
    end

    # Returns the EAD XML as a String, trying up to five times.
    # Raises RuntimeError when every attempt failed.
    def get_eadxml
      tries = 5
      begin
        if @ead_location.respond_to? :read
          @ead_location.read
        else
          read_location(@ead_location)
        end
      rescue => e
        tries -= 1
        retry if tries > 0
        raise "Could not get record by eadid! " + e.inspect
      end
    end

    # Returns the content found at +location+ and closes the handle.
    # URI.open is used when open-uri provides it, because Kernel#open stopped
    # fetching URLs in Ruby 3.0; otherwise Kernel.open is used as before.
    def read_location(location)
      opener = (defined?(::URI) && ::URI.respond_to?(:open)) ? ::URI : ::Kernel
      opener.open(location) { |io| io.read }
    end

  end
end
198
+
@@ -0,0 +1,112 @@
1
module Mead
  # Parses a mead String of the form
  # "eadid-series-container-folder-sequence[_page]" into its parts.
  #
  # After construction:
  #   eadid, series, sequence, page - Strings (leading zeros removed from the
  #                                   last three) or nil when absent
  #   container - {:type => ..., :number => ...} or nil when absent
  #   folder    - {:type => ..., :number => ...} or nil when absent/blank
  #
  # Parts missing from a short identifier stay nil instead of raising
  # NoMethodError, so the object can still be built and inspected.
  #
  # CONTAINER_MAPPING is defined outside this class; it is indexed here by
  # the first two characters of the container and folder parts.
  class Identifier

    attr_accessor :mead, :eadid, :series, :container, :folder, :sequence, :page,
      :ead_location, :metadata
    include Mead::Validations
    validates_format_of_mead
    validates_presence_of_mead
    validates_numericality_of_mead :sequence, :page

    # If a location is given then extraction can take place
    def initialize(mead, ead_location=nil)
      @mead = mead
      @metadata = nil
      parse_mead 'eadid', 'series', 'container', 'folder', 'sequence'
      @ead_location = parse_ead_location(ead_location)
      split_container
      split_folder
      split_page
      clean_zeros 'series', 'sequence', 'page'
    end

    # Splits the mead on '-' and assigns the parts, in order, to the instance
    # variables named by args. Missing parts become nil.
    def parse_mead(*args)
      parts = @mead.split('-')
      args.each_with_index do |field, i|
        instance_variable_set('@' + field, parts[i])
      end
    end

    # Turns the container part into {:type, :number}: the first two
    # characters select the type through CONTAINER_MAPPING, up to ten
    # following characters form the number.
    def split_container
      return if @container.nil?
      type = CONTAINER_MAPPING[ @container[0,2] ]
      number = strip_zeros(container_number_transforms(@container[2,10].to_s))
      @container = {:type=> type, :number=> number}
    end

    # Turns the folder part into {:type, :number}. A part that does not start
    # with a CONTAINER_MAPPING key is a plain 'folder' number. A folder whose
    # number is empty after removing zeros (e.g. "000") becomes nil.
    def split_folder
      return if @folder.nil?
      if CONTAINER_MAPPING.key?(@folder[0,2])
        type = CONTAINER_MAPPING[ @folder[0,2] ]
        number = strip_zeros(container_number_transforms(@folder[2,10].to_s))
      else
        type = 'folder'
        number = strip_zeros(container_number_transforms(@folder))
      end
      @folder = number.empty? ? nil : {:type=> type, :number=> number}
    end

    # Replaces '_' with '.' and '~' with '-', then removes leading zeros.
    def container_number_transforms(string)
      string.gsub('_','.').gsub('~', '-').sub(/\A0*/,'')
    end

    # Removes leading zeros from each named instance variable that is set.
    def clean_zeros(*args)
      args.each do |field|
        name = '@' + field
        value = instance_variable_get(name)
        instance_variable_set(name, strip_zeros(value)) if value
      end
    end

    # Returns num without leading zeros.
    def strip_zeros(num)
      num.sub(/\A0+/,'')
    end

    # Splits "sequence_page" into @sequence and @page.
    def split_page
      return if @sequence.nil?
      @sequence, @page = @sequence.split('_')
    end

    # Normalises the EAD location and returns it (also stored in
    # @ead_location):
    #   nil                     -> nil
    #   readable IO             -> the IO, rewound when it is at EOF
    #   String with http(s)://  -> as given when it contains the eadid,
    #                              otherwise "loc/eadid.xml"
    #   any other String        -> nil
    def parse_ead_location(loc)
      return nil if loc.nil?
      if loc.respond_to?(:read)
        loc.rewind if loc.respond_to?(:eof?) && loc.eof?
        @ead_location = loc
      elsif loc.include?('http://') || loc.include?('https://')
        @ead_location = loc.include?(@eadid) ? loc : File.join(loc, @eadid + '.xml')
      end
    end

    # Runs a Mead::Extractor for this identifier, stores the result in
    # @metadata and returns self.
    def extract
      @metadata = Mead::Extractor.new(self).extract
      self
    end

    # def ead_has_series?
    #   if series > 1
    #     true
    #   else
    #     if
    #       false
    #     end
    #   end

    # def replace_underscores(*args)
    #   args.each do |field|
    #     value = instance_variable_get('@' + field).gsub('_', '.')
    #     instance_variable_set('@' + field, value)
    #   end
    # end

  end
end
@@ -0,0 +1,38 @@
1
module Mead
  # Helpers for command line tools that receive the EAD source as one of the
  # options :baseurl, :url or :file. Where several are present the
  # precedence is :baseurl, then :url, then :file.
  module TrollopOptions
    # Ends the program through Trollop::die unless exactly one of :baseurl,
    # :url and :file is set in opts.
    def check_options(opts)
      number_of_get_methods = [:baseurl, :url, :file].count { |option| opts[option] }
      if number_of_get_methods > 1
        Trollop::die 'Must specify ONLY one way to get the Ead XML'
      elsif number_of_get_methods == 0
        Trollop::die 'Must specify at least one way to get the Ead XML'
      end
    end

    # Returns a one-entry Hash {option => location} for the option that is
    # set, or {} when none is. For :file the value is an opened File, which
    # the caller is responsible for closing.
    def get_location_options(opts)
      key = chosen_location_option(opts)
      key ? {key => get_location(opts)} : {}
    end

    # Returns the location for the option that is set: the String given for
    # :baseurl or :url, an opened File for :file (closing it is left to the
    # caller), or nil when none is set.
    def get_location(opts)
      key = chosen_location_option(opts)
      return nil unless key
      key == :file ? File.open(opts[:file]) : opts[key]
    end

    private

    # First of :baseurl, :url, :file that has a truthy value in opts, or nil.
    def chosen_location_option(opts)
      [:baseurl, :url, :file].find { |option| opts[option] }
    end
  end
end