mead 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/mead/ead.rb ADDED
@@ -0,0 +1,215 @@
1
module Mead
  # Reads an EAD finding aid (XML) and builds one record per <did> element
  # that holds <container> children. Each record is a Hash with the keys
  # :series, :mead, :title and :containers, collected in #containers.
  #
  # Names used here but defined elsewhere: Nokogiri, CONTAINER_MAPPING,
  # CSV / FasterCSV, and open-uri for remote documents.
  class Ead
    # factor out :baseurl, :file and :url into an options object?
    attr_accessor :containers, :ead, :baseurl, :file, :url, :doc, :eadid, :series_present,
      :c01s_series_answer

    # Loads the EAD, works out the eadid if none was given and crawls the
    # document for containers.
    #
    # opts may include :eadid, :file (any object that responds to #read),
    # :url and :baseurl. :baseurl is combined with the eadid, so it needs
    # :eadid as well.
    def initialize(opts={})
      @eadid = opts[:eadid] || nil
      @file = opts[:file] || nil
      @baseurl = opts[:baseurl] || nil
      @url = opts[:url] || nil
      @containers = []

      get_ead
      find_eadid unless @eadid
      crawl_for_containers
    end

    # Fetches the EAD XML into @ead and parses it into @doc.
    #
    # Source precedence: readable :file, then :url, then :baseurl + eadid.
    # Raises RuntimeError when none of those can be used.
    def get_ead
      readable = @file && @file.respond_to?(:read)
      unless readable || @url || (@baseurl && @eadid)
        raise 'Cannot get EAD based on params.'
      end
      if readable
        # A handle that was already consumed would otherwise read as "".
        @file.rewind if @file.respond_to?(:eof?) && @file.eof?
        @ead = @file.read
      elsif @url
        @ead = read_location(@url)
      else
        @ead = read_location(File.join(@baseurl, @eadid + '.xml'))
      end
      @doc = Nokogiri::XML(@ead)
    end

    # Takes the eadid from the first <eadid> element of the document.
    # Any failure (no such element, XPath error) is reported as one
    # RuntimeError.
    def find_eadid
      @eadid = @doc.xpath('//xmlns:eadid').first.text
    rescue
      raise 'Need an eadid and none has been given and it cannot be found in the EAD XML.'
    end

    # Appends one info Hash to @containers for every distinct parent of a
    # <container> element found below each c01.
    def crawl_for_containers
      c01s.each_with_index do |c01, i|
        dids = c01.xpath('.//xmlns:container').map { |container| container.parent }.uniq
        #c.xpath('xmlns:c02/xmlns:did').map do |did|
        dids.each do |did|
          info = {}
          # if all the c01s are at the file level this fails
          info[:series] = c01s_series? ? i + 1 : 0
          info[:mead] = create_mead(did, i)
          info[:title] = concat_title(did)
          # FIXME
          info[:containers] = text_containers(did)
          @containers << info
        end
      end
    end

    # Returns one "Type text" string per <container> directly under did;
    # the type prefix is left out when the attribute is missing.
    def text_containers(did)
      did.xpath('xmlns:container').map do |container|
        type = container.attribute('type')
        prefix = type ? type.text + ' ' : ''
        "#{prefix}#{container.text}"
      end
    end

    # All c01 elements directly under a dsc.
    def c01s
      @doc.xpath('//xmlns:dsc/xmlns:c01')
    end

    # True when every c01 under dsc has level="series". The answer is
    # computed once and cached in @c01s_series_answer, including a false
    # answer, so the two XPath queries are not repeated for every did.
    def c01s_series?
      if @c01s_series_answer.nil?
        @c01s_series_answer = (c01s.length == series_c01s.length)
      end
      @c01s_series_answer
    end

    # The c01 elements under dsc whose level attribute is "series".
    def series_c01s
      @doc.xpath("//xmlns:dsc/xmlns:c01[@level='series']")
    end

    # Builds the mead identifier for did, where i is the zero-based index of
    # the enclosing c01. The parts are joined with '-':
    # eadid, series ("%03d" of i + 1, or '001'), container codes, '001'.
    #
    # When the container part cannot be built (see #specific_containers) the
    # shortened "eadid-series" form is returned instead; #valid? treats such
    # short identifiers as invalid. The result is also kept in @mead.
    def create_mead(did, i)
      mead = [@eadid.dup]
      mead << (c01s_series? ? "%03d" % (i + 1) : '001') #series
      begin
        mead << specific_containers(did)
      rescue
        return @mead = mead.flatten.join('-')
      end
      mead << '001' # stub for first record
      @mead = mead.flatten.join('-')
    end

    # Concatenates unittitle, ", unitdate" and " (unitid)" for did, skipping
    # the date and id parts when their text is empty.
    # NOTE(review): a did with a date but no title yields a leading ", " —
    # confirm whether that is wanted.
    def concat_title(did)
      # Node#xpath returns a (possibly empty) node set, never nil, so only
      # the text needs to be checked for emptiness.
      title = did.xpath('xmlns:unittitle').text.dup
      unitdate = did.xpath('xmlns:unitdate').text
      title << ', ' << unitdate unless unitdate.empty?
      unitid = did.xpath('xmlns:unitid').text
      title << ' (' + unitid + ')' unless unitid.empty?
      title
    end

    # Returns the two container segments of a mead for did.
    # One container gives [box, '000'], two give [box, second padded to 3].
    # Raises RuntimeError for zero or more than two containers.
    def specific_containers(did)
      containers = did.xpath('xmlns:container')
      case containers.length
      when 1
        [make_box(containers[0]), '000']
      when 2
        [make_box(containers[0]), make_box(containers[1], 3)]
      when 0
        raise "Do we really have zero containers?!"
      else
        raise "I can't create a mead identifier with more than 2 containers in a did!"
      end
    end

    # Encodes one container as its type code followed by its text, left
    # padded to +padding+ characters. Spaces (padding included) become '0',
    # '.' becomes '_' and '-' becomes '~'.
    def make_box(container, padding=4)
      # FIXME: pad based on first part of range for folder +++
      padded = "%0#{padding}s" % container.text
      text = padded.gsub(' ','0').gsub('.','_').gsub('-', '~')
      container_type(container) + text
    end

    # Looks up the CONTAINER_MAPPING key whose value equals the container's
    # type attribute, compared as given and downcased. Returns '' when
    # nothing matches; the last matching key wins. Raises NoMethodError when
    # the container has no type attribute, which #create_mead relies on to
    # fall back to the short identifier.
    def container_type(container)
      type = container.attribute('type').text
      match = ''
      CONTAINER_MAPPING.each do |code, label|
        match = code if type == label || type.downcase == label
      end
      match
    end

    # CSV for this document's containers; see Ead.to_csv.
    def to_csv
      Mead::Ead.to_csv(self.containers)
    end

    # Renders a list of container Hashes as a CSV string with the header
    # mead,title,series,containers.
    def self.to_csv(container_list)
      # old CSV was loaded => FasterCSV, otherwise use CSV from 1.9
      csv_class = CSV.const_defined?(:Reader) ? FasterCSV : CSV
      csv_class.generate do |csv|
        # FIXME
        csv << ['mead','title','series', 'containers']
        #csv << ['mead','title','series']
        container_list.each do |container|
          csv << [container[:mead], container[:title], container[:series], container[:containers].join(', ')]
          #csv << [container[:mead], container[:title], container[:series]]
        end
      end
    end

    # True when every container has its own mead and none of them is a
    # short (two segments or fewer) identifier.
    def valid?
      unique_meads.length == @containers.length && !short_meads?
    end

    # The distinct mead identifiers.
    def unique_meads
      meads.uniq
    end

    # Distinct meads with more than two '-' separated segments.
    def long_meads
      unique_meads.select { |m| m.split('-').length > 2 }
    end

    # Distinct meads with two or fewer '-' separated segments.
    def short_meads
      unique_meads.select { |m| m.split('-').length <= 2 }
    end

    # True when at least one distinct mead is short.
    def short_meads?
      unique_meads.length != long_meads.length
    end

    # The container Hashes whose mead occurs more than once.
    def invalid
      duplicates = dups
      @containers.select { |container| duplicates.include?(container[:mead]) }
    end

    # Sorted list of meads that occur more than once.
    def dups
      counts = meads.inject({}) { |tally, mead| tally[mead] = tally[mead].to_i + 1; tally }
      counts.reject { |_mead, count| count == 1 }.keys.sort
    end

    # The mead of every container, in crawl order, duplicates included.
    def meads
      @containers.map { |container| container[:mead] }
    end

    private

    # Reads everything at +location+ and closes the handle afterwards.
    # URI.open is used when open-uri provides it, because Kernel#open no
    # longer opens URLs from Ruby 3.0 on; older setups keep the plain open.
    def read_location(location)
      if defined?(URI) && URI.respond_to?(:open)
        URI.open(location) { |io| io.read }
      else
        open(location) { |io| io.read }
      end
    end

  end
end
215
+
@@ -0,0 +1,46 @@
1
module Mead
  # Validates every *.xml file of a directory by loading it as a Mead::Ead
  # and asking it whether it is valid?.
  class EadValidator
    attr_accessor :directory, :valid, :invalid, :invalid_full

    # Creates a new EadValidator when given the path to a directory as a String
    def initialize(directory)
      @directory = directory
      @valid = []
      @invalid = []
      @invalid_full = []
    end

    # Loads each *.xml file of the directory in its own thread, using the
    # file's basename as eadid, and sorts the eadids into #valid and
    # #invalid. A file that cannot be loaded is recorded as invalid together
    # with the error raised. Returns #metadata.
    def validate!
      files = Dir.glob(File.join(@directory, '*.xml')).sort
      threads = files.map do |path|
        Thread.new(path) do |path_t|
          eadid = File.basename(path_t, '.xml')
          begin
            # Mead::Ead reads the whole file while it is constructed, so the
            # handle can be closed as soon as new returns.
            ead = File.open(path_t) { |file| Mead::Ead.new({:file => file, :eadid => eadid}) }
            [eadid, ead, nil]
          rescue => e
            [eadid, nil, e]
          end
        end
      end
      # Results are recorded here, in the calling thread, so the worker
      # threads never touch the shared arrays.
      threads.each do |thread|
        eadid, ead, error = thread.value
        if error
          record_invalid(eadid, ead, error)
        elsif ead.valid?
          @valid << eadid
        else
          record_invalid(eadid, ead)
        end
      end
      metadata
    end

    # Records eadid as invalid. ead may be nil when the document could not
    # be loaded at all; :dups and :containers are then empty.
    def record_invalid(eadid, ead, error=nil)
      @invalid << eadid
      @invalid_full << {:eadid => eadid, :error => error,
                        :dups => (ead ? ead.dups : []),
                        :containers => (ead ? ead.invalid : [])}
    end

    # Sorted eadids under the keys :valid and :invalid.
    def metadata
      {:valid => @valid.sort, :invalid => @invalid.sort}
    end
  end
end
46
+
@@ -0,0 +1,198 @@
1
module Mead
  # Looks up the <did> a Mead::Identifier points to inside its EAD document
  # and collects descriptive data for it and its ancestors.
  #
  # Names used here but defined elsewhere: Nokogiri, Mead::Identifier,
  # Mead::Container, and open-uri for remote documents.
  class Extractor

    attr_accessor :mead, :dsc, :series, :doc, :ead_location, :stack, :node

    # a stack contains the data (unittitle, unitdate) from the identifier's
    # container all the way through to parent containers. The order is from
    # most specific to least specific
    #
    # Raises RuntimeError unless mead is a Mead::Identifier.
    def initialize(mead)
      @stack = []
      get_mead_obj(mead)
    end

    # Loads and parses the EAD, fills the stack and returns it. The stack is
    # also stored on the identifier as its metadata.
    def extract
      get_ead_location
      eadxml = get_eadxml
      @doc = Nokogiri::XML(eadxml)
      do_extraction
      @mead.metadata = @stack # make sure the metadata always gets cached to the identifier
      return @stack
    end

    private

    # Stores mead after checking its class.
    def get_mead_obj(mead)
      raise "is not a Mead::Identifier" unless mead.is_a? Mead::Identifier
      @mead = mead
    end

    # Copies the EAD location from the identifier; raises when it has none.
    def get_ead_location
      unless mead.ead_location
        raise 'Cannot extract because no Ead location defined in Mead::Identifier.'
      end
      @ead_location = mead.ead_location
    end

    # Runs the lookup steps in order: dsc, series, target did, stack.
    def do_extraction
      get_dsc
      get_series
      find_node
      push_to_stack(@node)
    end

    # Pushes the data of the did +node+ onto the stack and then repeats for
    # the did of the enclosing component (node.parent.parent), if it has one.
    # Stops when node is nil or when the entry equals the last one pushed.
    def push_to_stack(node)
      return nil if node.nil?
      additional_did = {:unittitle => container_field('unittitle', node),
        :unitdate => container_field('unitdate', node),
        :level => container_level(node),
        :unitid => container_field('unitid', node)
      }
      did_location_text = did_location(node)
      additional_did[:item_location] = did_location_text if did_location_text

      add_containers(additional_did, node)

      if additional_did[:level] == 'series'
        additional_did[:series_number] = series_number(node)
      end
      return if @stack.last == additional_did
      @stack << additional_did
      parent_dids = node.parent.parent.xpath('xmlns:did')
      push_to_stack(parent_dids[0]) unless parent_dids.empty?
    end

    # Adds a :containers list of Mead::Container objects to hash when node
    # has <container> children; leaves hash untouched otherwise.
    def add_containers(hash, node)
      containers = node.xpath('./xmlns:container')
      return if containers.empty?
      hash[:containers] = []
      containers.each do |container|
        c = Mead::Container.new
        c.type = container.attribute('type').text if container.attribute('type')
        c.label = container.attribute('label').text if container.attribute('label')
        c.text = container.text if !container.text.empty?
        hash[:containers] << c
      end
    end

    # "Type text, Type text" for the containers of did, or nil when it has
    # none. A container without a type attribute contributes its text only.
    def did_location(did)
      location = did.xpath('./xmlns:container').map do |container|
        type = container.attribute('type')
        type ? type.text + ' ' + container.text : container.text
      end
      location.join(', ') unless location.empty?
    end

    # Picks the c01 with level="series" at the identifier's (1-based) series
    # position. When the dsc has no such c01 at all, the whole dsc is used.
    # @series is left as it was when the position matches nothing.
    def get_series
      c01_series = @dsc.xpath(".//xmlns:c01[@level='series']")
      if c01_series && !c01_series.empty?
        index = mead.series.to_i - 1
        # Guard the lower bound: a negative index would count from the end.
        @series = c01_series[index] if index >= 0 && index < c01_series.length
      else
        @series = @dsc
      end
    end

    # XPath predicate matching the identifier's folder type as given and
    # capitalized; 'folder' also matches envelopes.
    def folder_types
      types = "@type='#{@mead.folder[:type]}' or @type='#{@mead.folder[:type].capitalize}'"
      if @mead.folder[:type] == 'folder'
        types << " or @type='envelope' or @type='Envelope'"
      end
      types
    end

    # Finds the did holding the identifier's container (and folder, when
    # +folder+ is true and the identifier has one) and stores it in @node.
    # Returns nil without searching when the container has no type.
    #
    # Raises RuntimeError when more than one container matches, or when none
    # matches. For folder number '1' a miss is retried exactly once without
    # the folder condition before raising.
    #
    # NOTE(review): identifier values are interpolated into the XPath
    # unescaped; a value containing a single quote would break the query.
    def find_node(folder=true)
      #dsc_dids = series.xpath('.//xmlns:did')
      return nil unless @mead.container[:type]
      container_set_xpath = ".//xmlns:container[text()='#{@mead.container[:number]}' and (@type='#{@mead.container[:type]}' or @type='#{@mead.container[:type].capitalize}')]"
      if folder && @mead.folder
        container_set_xpath << "/../xmlns:container[text()='#{@mead.folder[:number]}' and (#{folder_types})]"
      end
      containers = series.xpath(container_set_xpath)
      #matching_dids
      if containers.length > 1
        raise "too many matching nodes!"
      elsif containers.length == 0
        # Second chance to handle legacy identifiers where a blank folder was given as 001.
        # Only taken while the folder was part of the query, so it cannot recurse forever.
        if folder && @mead.folder && @mead.folder[:number] == '1'
          #@mead.folder = nil #TODO: check do 000 folders get automatically turned to nil when the mead is created?
          find_node(false)
        else
          raise "no matching dids!"
        end
      else
        @node = containers[0].parent
      end
    end

    # Text of the +field+ children of node, or nil when it is empty.
    def container_field(field, node)
      text = node.xpath('xmlns:' + field).text
      (text.nil? || text.empty?) ? nil : text
    end

    # The level attribute of node's parent, or nil.
    def container_level(node)
      node.parent['level'] || nil
    end

    # 1-based position of node's parent among all c01 elements of the
    # document, or nil when the parent is not one of them.
    def series_number(node)
      index = node.document.xpath('//xmlns:c01').index(node.parent)
      index && index + 1
    end

    # Stores the dsc node set of the document.
    def get_dsc
      @dsc = @doc.xpath('//xmlns:dsc')
    end

    # Returns the EAD XML as a String, read from @ead_location when it
    # responds to #read and opened by location otherwise. Tries up to five
    # times before raising RuntimeError.
    def get_eadxml
      tries = 5
      begin
        if @ead_location.respond_to? :read
          # An already consumed handle would otherwise read as "".
          @ead_location.rewind if @ead_location.respond_to?(:eof?) && @ead_location.eof?
          @ead_location.read
        else
          read_location(@ead_location)
        end
      rescue => e
        tries -= 1
        retry if tries > 0
        raise "Could not get record by eadid! " + e.inspect
      end
    end

    # Reads everything at +location+ and closes the handle afterwards.
    # URI.open is used when open-uri provides it, because Kernel#open no
    # longer opens URLs from Ruby 3.0 on; older setups keep the plain open.
    def read_location(location)
      if defined?(URI) && URI.respond_to?(:open)
        URI.open(location) { |io| io.read }
      else
        open(location) { |io| io.read }
      end
    end

  end
end
198
+
@@ -0,0 +1,112 @@
1
module Mead
  # A parsed mead identifier. The string is split on '-' into eadid, series,
  # container, folder and sequence; the sequence may carry a page after '_'.
  #
  # Names used here but defined elsewhere: Mead::Validations (and its
  # validates_* macros), CONTAINER_MAPPING and Mead::Extractor.
  class Identifier

    attr_accessor :mead, :eadid, :series, :container, :folder, :sequence, :page,
      :ead_location, :metadata
    include Mead::Validations
    validates_format_of_mead
    validates_presence_of_mead
    validates_numericality_of_mead :sequence, :page

    # Parses mead into its parts. A nil or incomplete mead no longer raises
    # here: the missing parts stay nil so the declared validations get the
    # chance to report the problem.
    #
    # If a location is given then extraction can take place
    def initialize(mead, ead_location=nil)
      @mead = mead
      @metadata = nil
      parse_mead 'eadid', 'series', 'container', 'folder', 'sequence'
      @ead_location = parse_ead_location(ead_location)
      split_container
      split_folder
      split_page
      clean_zeros 'series', 'sequence', 'page'
    end

    # Assigns the '-' separated parts of the mead, in order, to the instance
    # variables named by args. Parts that are missing are set to nil.
    def parse_mead(*args)
      parts = @mead.to_s.split('-')
      args.each_with_index do |field, i|
        instance_variable_set('@' + field, parts[i])
      end
    end

    # Turns the raw container part into {:type, :number}: the first two
    # characters are looked up in CONTAINER_MAPPING, the rest (at most ten
    # characters) is the number. Both are nil when the part is missing.
    def split_container
      if @container.nil?
        @container = {:type=> nil, :number=> nil}
        return
      end
      type = CONTAINER_MAPPING[ @container[0,2] ]
      # to_s: the slice is nil for a part shorter than two characters
      number = strip_zeros(container_number_transforms(@container[2,10].to_s))
      @container = {:type=> type, :number=> number}
    end

    # Turns the raw folder part into {:type, :number}, or nil when the part
    # is missing or its number is empty after removing leading zeros. A part
    # that does not start with a CONTAINER_MAPPING key is a plain 'folder'.
    def split_folder
      return @folder = nil if @folder.nil?
      code = @folder[0,2]
      if CONTAINER_MAPPING.key?(code)
        type = CONTAINER_MAPPING[code]
        number = strip_zeros(container_number_transforms(@folder[2,10]))
      else
        type = 'folder'
        number = strip_zeros(container_number_transforms(@folder))
      end
      @folder = number.empty? ? nil : {:type=> type, :number=> number}
    end

    # Reverses the identifier encoding of a container number: '_' becomes
    # '.', '~' becomes '-', and leading zeros are removed.
    def container_number_transforms(string)
      string.gsub('_','.').gsub('~', '-').gsub(/^0*/,'')
    end

    # Removes leading zeros from each named instance variable that is set.
    def clean_zeros(*args)
      args.each do |field|
        name = '@' + field
        value = instance_variable_get(name)
        instance_variable_set(name, strip_zeros(value)) if value
      end
    end

    # Returns num without its leading zeros.
    def strip_zeros(num)
      num.sub(/^0+/,'')
    end

    # Splits "sequence_page" into @sequence and @page; both are nil when
    # the mead has no sequence part.
    def split_page
      @sequence, @page = @sequence.to_s.split('_')
    end

    # Works out where the EAD can be read from and returns it (also stored
    # in @ead_location):
    # * nil stays nil,
    # * an object that responds to #read is used as is (rewound at EOF),
    # * a String containing http:// or https:// is used as is when it
    #   already contains the eadid, otherwise "<eadid>.xml" is appended.
    # Any other value yields nil.
    def parse_ead_location(loc)
      return nil if loc.nil?
      if loc.respond_to?(:read)
        loc.rewind if loc.respond_to?(:eof?) && loc.eof?
        @ead_location = loc
      elsif loc.respond_to?(:to_str) && loc =~ %r{https?://}
        eadid = @eadid.to_s
        @ead_location = loc.include?(eadid) ? loc : File.join(loc, eadid + '.xml')
      end
    end

    # Runs the extractor and stores its stack as metadata. Returns self.
    def extract
      @metadata = Mead::Extractor.new(self).extract
      self
    end

    # def ead_has_series?
    #   if series > 1
    #     true
    #   else
    #     if
    #     false
    #   end
    # end

    # def replace_underscores(*args)
    #   args.each do |field|
    #     value = instance_variable_get('@' + field).gsub('_', '.')
    #     instance_variable_set('@' + field, value)
    #   end
    # end

  end
end
@@ -0,0 +1,38 @@
1
module Mead
  # Helpers for command line tools that take the EAD source as one of the
  # options :baseurl, :url or :file.
  module TrollopOptions
    # Ends the program through Trollop::die unless exactly one of :baseurl,
    # :url and :file is set in opts.
    def check_options(opts)
      supplied = [:baseurl, :url, :file].select { |option| opts[option] }
      if supplied.length > 1
        Trollop::die 'Must specify ONLY one way to get the Ead XML'
      elsif supplied.length == 0
        Trollop::die 'Must specify at least one way to get the Ead XML'
      end
    end

    # Returns a Hash holding only the first of :baseurl, :url and :file that
    # is set; a :file path is replaced by the opened File. Empty when none
    # is set.
    def get_location_options(opts)
      return {:baseurl => opts[:baseurl]} if opts[:baseurl]
      return {:url => opts[:url]} if opts[:url]
      return {:file => File.open(opts[:file])} if opts[:file]
      {}
    end

    # Returns the first of :baseurl, :url and :file that is set, opening
    # the :file path as a File. nil when none is set.
    def get_location(opts)
      return opts[:baseurl] if opts[:baseurl]
      return opts[:url] if opts[:url]
      File.open(opts[:file]) if opts[:file]
    end
  end
end