marc_alephsequential 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ Yjk3YTZkMTY4Njg0NGY3ZTQ5ZGY2MzQ3MGIxMDA4NTdjMzEzZmNkNA==
5
+ data.tar.gz: !binary |-
6
+ NmMwMWQzMTIzMmQ3ZmNlOTM5YjRiMGFmZTI2MTA3NmM0Njk0ZTJiYg==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ NWE0M2NiM2M5MTcwOTJjNWJkN2JiMDk5OTJmNWIxYmVmNTg4NGY1ZTAxZGE1
10
+ OTM4ZDBhMTA3ZDZiYmY0YWU3M2I5NWM5MzBlYjhkY2QxNGZhNDdhMDAyM2Qx
11
+ MjQ4YWFiNTkxMjhmODk3Mzc0YjIzMGVlMzVmYzRlOTc0Y2Y3YmE=
12
+ data.tar.gz: !binary |-
13
+ OWZmOGVkYWVkZTM4ZGQ1YmM1NGMzNzBkM2IwY2I4NmZmMmI1OTYzN2FhMTQw
14
+ NDcwMDQ0Zjk3MzdiZWQxN2M1ZmQzMGRhYWFkNTg5NGFiNTY4Yjk4N2JiMTFi
15
+ ZWEyZjJkMThkY2RlM2VjYmZhNTQ1NmYxYzM5NDhlN2QyOGY3MGE=
data/.document ADDED
@@ -0,0 +1,3 @@
1
+ -
2
+ ChangeLog.md
3
+ LICENSE.txt
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ Gemfile.lock
2
+ doc/
3
+ pkg/
4
+ vendor/cache/*.gem
data/.travis.yml ADDED
@@ -0,0 +1,10 @@
1
+ language: ruby
2
+ bundler_args: --without documentation
3
+ rvm:
4
+ - 1.9.3
5
+ - 2.0.0
6
+ - jruby-19mode
7
+ branches:
8
+ only:
9
+ - master
10
+ - /^ci-.*$/
data/.yardopts ADDED
@@ -0,0 +1 @@
1
+ --markup markdown --title "marc_alephsequential Documentation" --protected
data/ChangeLog.md ADDED
@@ -0,0 +1,4 @@
1
+ ### 0.1.0 / 2013-08-12
2
+
3
+ * Initial release:
4
+
data/Gemfile ADDED
@@ -0,0 +1,18 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
3
+
4
+ gem 'marc'
5
+ gem 'yell'
6
+
7
+ group :test do
8
+ gem 'minitest'
9
+ gem "minitest-reporters", '>= 0.8.0'
10
+ gem 'minitest-colorize'
11
+ end
12
+
13
+ group :development do
14
+ gem 'kramdown'
15
+ gem 'bundler', '~> 1'
16
+ gem 'rake', '>= 1.0'
17
+ gem 'yard', '>= 0.8'
18
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2013 Bill Dueber
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,103 @@
1
+ # marc_alephsequential
2
+ [![Build Status](https://secure.travis-ci.org/billdueber/marc_alephsequential.png)](http://travis-ci.org/billdueber/marc_alephsequential)
3
+
4
+ A [ruby-marc](https://github.com/ruby-marc/ruby-marc) reader for MARC files in the Aleph sequential format
5
+
6
+ * [Homepage](https://github.com/billdueber/marc_alephsequential#readme)
7
+ * [Issues](https://github.com/billdueber/marc_alephsequential/issues)
8
+ * [Documentation](http://rubydoc.info/gems/marc_alephsequential/frames)
9
+ * [Email](mailto:bill at dueber.com)
10
+
11
+ ## Examples
12
+
13
+ ```ruby
14
+
15
+ require 'marc'
16
+ require 'marc_alephsequential'
17
+
18
+ log = GetALogFromSomewhere.new
19
+ # reader = MARC::AlephSequential::Reader.new('myfile.seq')
20
+ reader = MARC::AlephSequential::Reader.new('myfile.seq.gz') # automatically notice the .gz and behave!
21
+
22
+ reader.log = log # optional. Set up a logger; otherwise, a default logger will be used
23
+
24
+ begin
25
+ reader.each do |r|
26
+ # do stuff with the record
27
+ end
28
+ rescue MARC::AlephSequential::Error => e
29
+ log.error "Error while parsing record #{e.record_id} at/near #{e.line_number}: #{e.message}"
30
+ retry # may or may not work the way you'd hope/expect
31
+ rescue => e
32
+ log.error "Other error of some sort. quitting. #{e.message}"
33
+ end
34
+
35
+ ```
36
+
37
+ ## Description of the Aleph Sequential format
38
+
39
+ Aleph sequential is a MARC serialization format that is easily output by Ex Libris' Aleph software.
40
+ Each MARC record is presented as a series of unicode text lines, one field per line.
41
+
42
+
43
+ 000000228 LDR L ^^^^^nam^a22002891^^4500
44
+ 000000228 001 L 000000228
45
+ 000000228 006 L m^^^^^^^^d^^^^^^^^
46
+ 000000228 007 L cr^bn^---auaua
47
+ 000000228 008 L 880715r19691828nyuab^^^^^^^^|00000^eng^^
48
+ 000000228 010 L $$a68055188
49
+ 000000228 020 L $$a083711750X
50
+ 000000228 035 L $$a(RLIN)MIUG0021856-B
51
+ 000000794 24514 L $$aThe descent of manuscripts.
52
+ 000000794 60010 L $$aCicero, Marcus Tullius$$xManuscripts.
53
+ 000000794 60000 L $$aPlato.$$tCritias$$xManuscripts.
54
+
55
+ Each line has the following format (note: All must be in utf-8)
56
+
57
+ * 9 characters (all digits) for the aleph record ID
58
+ * [space]
59
+ * 3 character tag (left-justified / space padded if need be)
60
+ * 1 character indicator 1
61
+ * 1 character indicator 2
62
+ * [space L space], for some historic reasons I don't know
63
+ * The tag's value, perhaps with internal subfields
64
+
65
+ A record is defined as a set of contiguous lines with the same record ID (i.e., you know you've finished a record when the record ID changes or you hit EOF).
66
+
67
+ ### How to read the Aleph sequential "value"
68
+
69
+ The leader and control fields have no internal structure, but spaces in the values are stored as '^' for some reason. (The reader, obviously, changes them back into spaces)
70
+
71
+ For data fields, the subfields are indicated as follows:
72
+
73
+ * A _subfield start marker_ (let's just say "SSM") matches /\$\$[a-z0-9]/ (e.g., $$a)
74
+ * The value string for a data field must start with an SSM
75
+ * An SSM marks the start of a subfield (and the end of the previous subfield, if any)
76
+
77
+ ### Obvious limitations of the Aleph sequential format
78
+
79
+ Actually, it's not all bad; I like it in a lot of ways. A little verbose at times, but easy to read for a human, and easy to write one-off scripts to run through a file and get statistics about use of tags, find a specific record (just match the bib ID at the beginning of the line), etc.
80
+
81
+ The easy-to-see problems are:
82
+
83
+ * fixed field size. Aleph has a lot of Cobol underneath. So if your bib ids don't happen to be nine characters, well, too bad.
84
+ * You can't have an embedded '$$' in a data field's value, because it will be interpreted as the start of a new subfield. '$$' isn't super common as a typo, but I've seen it.
85
+
86
+
87
+ ## Parse errors and automatic workarounds
88
+
89
+ * Lines that don't start with a nine-digit id will be assumed to be a part of the previous line that has an illegal spurious newline. The newline will be removed and all put back together again. If there is no "previous line" because it's the first line of the file, throw an error.
90
+ * Any completed record that doesn't include a leader (LDR) will throw an error
91
+ * Datafield values that don't start with '$$' will be logged as an error, and the leading data will be assumed to belong in subfield $$a
92
+
93
+
94
+
95
+ ## Install
96
+
97
+ $ gem install marc_alephsequential
98
+
99
+ ## Copyright
100
+
101
+ Copyright (c) 2013 Bill Dueber
102
+
103
+ See {file:LICENSE.txt} for details.
data/Rakefile ADDED
@@ -0,0 +1,38 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+
5
+ begin
6
+ require 'bundler'
7
+ rescue LoadError => e
8
+ warn e.message
9
+ warn "Run `gem install bundler` to install Bundler."
10
+ exit -1
11
+ end
12
+
13
+ begin
14
+ Bundler.setup(:development)
15
+ rescue Bundler::BundlerError => e
16
+ warn e.message
17
+ warn "Run `bundle install` to install missing gems."
18
+ exit e.status_code
19
+ end
20
+
21
+ require 'rake'
22
+
23
+ require "bundler/gem_tasks"
24
+
25
+ require 'yard'
26
+ YARD::Rake::YardocTask.new
27
+ task :doc => :yard
28
+
29
+ require 'rake/testtask'
30
+ Rake::TestTask.new do |t|
31
+ t.libs.push 'lib'
32
+ t.libs.push 'spec'
33
+ t.test_files = Dir.glob('spec/**/*_spec.rb')
34
+ end
35
+
36
+ task :spec => :test
37
+
38
+ task(:default => :test)
@@ -0,0 +1,161 @@
1
+ require 'marc'
2
+ require_relative 'error'
3
+ require_relative 'log'
4
+
5
module MARC
  module AlephSequential

    # A model of a single line (one field) in an Aleph sequential file.
    #
    # Lines are laid out in fixed columns:
    #
    #   0-8   record id (nine digits)
    #   9     space
    #   10-12 tag
    #   13    indicator 1
    #   14    indicator 2
    #   15-17 " L "
    #   18+   the value
    class ASLine

      include Log

      # Characters in leader/control fields that need to be turned (back) into spaces
      TURN_TO_SPACE = /\^/

      # Pattern used to split data field values into subfield code/value pairs
      SUBFIELD_SPLIT_PATTERN = /\$\$([a-zA-Z0-9])/

      # How to know if we have a valid id? Must be exactly 9 digits.
      # Anchored with \A/\z so the whole string has to match, not just one line of it.
      VALID_ID = /\A\d{9}\z/

      # The passed in raw string (chomped), used for post-processing later on
      attr_accessor :rawstr

      # The line number in the file/stream, for error reporting
      attr_accessor :line_number

      # Either the value of a control/fixed field, or a string representation
      # of a datafield's subfields
      attr_accessor :value

      # The type of field (:leader, :control, :data, or :invalid_id)
      attr_accessor :type

      # The record id and the two indicators
      attr_accessor :id, :ind1, :ind2

      # The MARC field's tag. The writer is defined by hand below (#tag=)
      # because it also sets #type.
      attr_reader :tag


      # Given a raw string and a line number, construct the appropriate ASLine.
      #
      # @param [String] rawstr The raw string from the file
      # @param [Number] line_number The line number from the file/stream, for error reporting
      def initialize(rawstr, line_number)
        @rawstr      = rawstr.chomp
        @line_number = line_number

        # Assignment runs left to right; id has to be set before tag because
        # #tag= calls #valid_id? to decide on the type.
        self.id, self.tag, self.ind1, self.ind2, self.value = parseline(@rawstr)

        # clean up the leader or fixed fields
        self.value = cleanup_fixed(self.value) if [:leader, :control].include?(self.type)
      end

      # Does this line have a valid (-looking) id?
      # @return [Boolean]
      def valid_id?
        VALID_ID.match(id) ? true : false
      end


      # Turn it into an actual MARC field (control or data)
      # @return [MARC::ControlField, MARC::DataField]
      # @raise [MARC::AlephSequential::Error] if called on anything other than
      #   a :control or :data line (e.g., a leader line)
      def to_field
        case type
        when :control
          to_control_field
        when :data
          to_data_field
        else
          raise MARC::AlephSequential::Error.new(id, line_number),
                "Tried to call #to_field on line type '#{self.type}'"
        end
      end

      # Turn the current object into a control field, without doing any checks
      # @return [MARC::ControlField]
      def to_control_field
        MARC::ControlField.new(tag, cleanup_fixed(self.value))
      end

      # Turn the current object into a datafield, without doing any checks.
      #
      # If the value doesn't start with '$$', an error is logged and '$$a' is
      # prepended; note that this rewrites #value on the object itself.
      # @return [MARC::DataField]
      def to_data_field
        unless self.value.to_s.start_with?('$$')
          log.error("#{self.line_number} #{self.id} Variable field #{self.tag} doesn't start with '$$'. Prepending '$$a'.")
          self.value = '$$a' + self.value.to_s
        end

        field = MARC::DataField.new(tag, ind1, ind2)
        field.subfields = parse_string_into_subfields(self.value)
        field
      end

      # Parse out a non-controlfield value string into a set of subfields
      # @param [String] val the value string, of the form "$$athis is the a$$band the b"
      # @return [Array<Subfield>] An array of MARC subfields
      #
      # Splitting on a pattern with a capture group gives
      # [leading_text, code, value, code, value, ...]. Any text before the first
      # '$$x' marker is discarded here; #to_data_field guards against that case
      # by prepending '$$a' before it calls this method.
      def parse_string_into_subfields(val)
        pieces = val.split(SUBFIELD_SPLIT_PATTERN)
        pieces.shift # whatever came before the first subfield marker

        # A marker at the very end of the string has no text after it; use an
        # empty string rather than handing a nil value to MARC::Subfield.
        pieces.each_slice(2).map { |code, text| MARC::Subfield.new(code, text || '') }
      end

      # Clean up fixed fields/leader, turning Ex Libris characters back into normal characters
      # @param [String] val The string to clean
      # @return [String] The cleaned string
      def cleanup_fixed(val)
        val.gsub(TURN_TO_SPACE, ' ')
      end

      # Set the tag. As a side effect, set the type when we set the tag
      # type will end up as :leader, :control, :data, or :invalid_id
      # @param [String] t the tag
      def tag=(t)
        @tag = t
        self.type =
          if t == 'LDR'
            :leader
          elsif MARC::ControlField.control_tag?(t)
            :control
          elsif valid_id?
            :data
          else
            :invalid_id
          end
      end


      # Get a line and parse it out into its component parts
      # @param [String] line the line to parse
      # @return [Array] An array of the form [id, tag, ind1, ind2, value]
      #
      # Slicing past the end of a short line returns nil. The value is coerced
      # to '' so later string operations (gsub, start_with?) don't blow up on
      # a truncated line.
      def parseline(line)
        [line[0, 9], line[10, 3], line[13, 1], line[14, 1], line[18..-1] || '']
      end

    end # ASLine
  end
end
160
+
161
+
@@ -0,0 +1,101 @@
1
+ require_relative 'asline'
2
+ require_relative 'error'
3
+ require_relative 'log'
4
+
5
+
6
module MARC
  module AlephSequential

    # A group of ASLine objects with logic to correctly turn them into a MARC::Record object
    # @see ASLine
    class ASLineGroup

      include Log

      # @!attribute aslines
      #   @return [Array<ASLine>] Internal list of the non-leader lines added so far
      attr_accessor :aslines

      # @!attribute [r] leader
      #   @return [String] The leader string, pulled from whatever was passed in with a LDR tag
      attr_reader :leader


      def initialize
        @aslines = []
        @leader  = nil
      end

      # Number of aslines already added (the leader line is not counted)
      # @return [Integer]
      def size
        aslines.size
      end

      # Is this group empty? (A group holding only a leader still counts as empty.)
      def empty?
        aslines.empty?
      end


      # Add an ASLine object.
      # A :leader line sets the leader (last one wins, with a warning).
      # An ASLine object with type :invalid_id will be treated as a string and appended to
      # the previous field (to deal with not-uncommon spurious newlines in data fields).
      # Anything else is stored as-is.
      # @param [ASLine] asline the line to add
      # @return [Undefined] side effect only
      # @raise MARC::AlephSequential::Error when there's an invalid ID _and_ there's no previous
      #   field to concatenate it to.
      def add(asline)
        case asline.type
        when :leader
          if leader
            log.warn("#{asline.line_number} #{asline.id} Set leader more than once; last one wins")
          end
          @leader = asline.value
        when :invalid_id
          append_to_previous(asline)
        else
          @aslines.push asline
        end
      end


      # Add an asline as a raw string
      # @param [String] asline_string the raw line
      # @param [Number] line_number its line number, for error reporting
      def add_string(asline_string, line_number)
        add(ASLine.new(asline_string, line_number))
      end

      # Turn this object into a MARC::Record
      # @return [MARC::Record]
      # @raise MARC::AlephSequential::Error if this object is empty
      # @raise MARC::AlephSequential::Error if there's no leader
      def as_record
        if empty?
          raise MARC::AlephSequential::Error.new('unknown', 'unknown'),
                "Can't turn an empty group into a record"
        end

        unless leader
          first = @aslines[0]
          raise MARC::AlephSequential::Error.new(first.id, first.line_number),
                "Record #{first.id} (near line #{ first.line_number}) has no leader; can't turn into a record"
        end

        r = MARC::Record.new
        r.leader = leader
        aslines.each { |line| r << line.to_field }
        r
      end

      alias_method :to_record, :as_record

      private

      # Glue a line with a bad id onto the most recently stored line and
      # re-parse the combined text as a single ASLine.
      # NOTE(review): the leader is not kept in @aslines, so a continuation that
      # directly follows a LDR line is attached to the last stored field instead.
      def append_to_previous(asline)
        lastfield = @aslines.pop
        unless lastfield
          raise MARC::AlephSequential::Error.new('unknown', asline.line_number),
                "#{asline.line_number} has invalid id and no previous line to concat it to (file starts bad?)"
        end
        log.info "#{asline.line_number} #{lastfield.id} / #{lastfield.tag} Concatenating line #{asline.line_number} to previous line"
        @aslines.push ASLine.new(lastfield.rawstr + asline.rawstr, lastfield.line_number)
      end
    end
  end
end
101
+
@@ -0,0 +1,16 @@
1
+ require_relative 'asline'
2
+ require_relative 'buffered_linereader'
3
+
4
module MARC
  module AlephSequential

    # A BufferedLineReader whose buffered items are ASLine objects
    # instead of raw strings.
    class ASLineReader < BufferedLineReader

      # Hook called for every raw line read from the underlying stream.
      # @param [String] raw the raw line
      # @param [Number] line_number the line's position in the stream
      # @return [ASLine] the parsed line
      def process_raw(raw, line_number)
        super(raw, line_number) # run the parent hook; its return value is not used
        ASLine.new(raw, line_number)
      end

    end
  end
end
16
+
@@ -0,0 +1,100 @@
1
+ require 'zlib'
2
+
3
module MARC
  module AlephSequential

    # AlephSequential is a line-oriented format, with the first field of each line
    # indicating the record number. Rather than keep track of the last line read
    # by hand, use a buffered line reader so callers can #peek at the next line
    # to know if its id is different than the current record.
    class BufferedLineReader

      include Enumerable

      # How many lines to read from the underlying stream per refill
      attr_accessor :buffer_size

      # How many lines have been pulled from the underlying stream so far
      attr_reader :underlying_line_number

      # @param [String, #each_line] filename_or_io A filename (a name ending in
      #   .gz is transparently gunzipped) or any object that responds to #each_line
      # @raise [ArgumentError] if given neither a String nor an #each_line-able object
      def initialize(filename_or_io)
        @passed_in = filename_or_io

        @underlying_line_number = 0
        @buffer_size = 10
        @buffer = []
        @owns_handle = false # true only when we opened the handle ourselves

        if filename_or_io.is_a? String
          @handle = open_file(filename_or_io)
          @owns_handle = true
        elsif filename_or_io.respond_to?(:each_line)
          # each_line is the only method ever called on the handle
          @handle = filename_or_io
        else
          raise ArgumentError.new("BufferedLineReader needs an IO object or filename, got #{filename_or_io} (#{filename_or_io.inspect})")
        end

        @iter = @handle.enum_for(:each_line)
        @finished = false
        # Fill up the buffer
        fillbuffer
      end

      # @return [Boolean] false once the stream is exhausted and the buffer is drained
      def has_next?
        !(@finished && @buffer.size == 0)
      end

      # Pull up to buffer_size more lines into the buffer, passing each through
      # #process_raw. When the stream runs out, mark the reader finished and
      # close the handle if this reader opened it.
      def fillbuffer(buffer_size = @buffer_size)
        buffer_size.times do
          raw = @iter.next
          @underlying_line_number += 1
          @buffer.push process_raw(raw, @underlying_line_number)
        end
      rescue StopIteration
        @finished = true
        close
      end

      # Empty version here; can override for processing lines on the fly
      # @param [String] raw the line as read
      # @param [Number] line_number 1-based position of the line in the stream
      # @return [Object] whatever should be stored in the buffer
      def process_raw(raw, line_number)
        raw
      end

      # @return [Object] the next buffered item
      # @raise [StopIteration] when there is nothing left
      def next
        raise StopIteration, "End of #{@passed_in}", nil if @buffer.size == 0
        rv = @buffer.shift
        fillbuffer if @buffer.size == 0
        rv
      end

      # Look at the next item without consuming it
      # @return [Object, nil] the next item, or nil at end of stream
      def peek
        fillbuffer unless @buffer.size > 0
        @buffer[0]
      end

      # Yield every remaining item. Without a block, return an Enumerator.
      #
      # The loop stops when the buffer is empty (the same condition under which
      # #next raises StopIteration), so a StopIteration raised by the caller's
      # own block is no longer silently swallowed.
      def each
        return enum_for(:each) unless block_given?
        yield self.next until @buffer.empty?
      end

      alias_method :each_line, :each

      # Close the underlying handle, but only if this reader opened it (i.e., it
      # was given a filename). Safe to call more than once. Called automatically
      # at end of stream; closing earlier makes further reads from the stream fail.
      def close
        return unless @owns_handle
        @handle.close unless @handle.closed?
      end

      private

      # Open the named file for reading utf-8 text. Gzipped files are opened
      # in binary mode (the compressed bytes are not text) and the encoding is
      # applied to the decompressed data instead.
      def open_file(filename)
        return File.open(filename, 'r:utf-8') unless filename =~ /\.gz\z/

        file = File.open(filename, 'rb')
        begin
          Zlib::GzipReader.new(file, :external_encoding => 'utf-8')
        rescue
          file.close # don't leak the file if it turns out not to be gzip data
          raise
        end
      end
    end
  end
end
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
@@ -0,0 +1,13 @@
1
module MARC
  module AlephSequential

    # Raised for problems found while reading Aleph sequential data. Carries
    # the record id and line number involved, for error reporting.
    class Error < RuntimeError
      attr_accessor :record_id, :line_number

      # @param [String] record_id the id of the record being processed
      # @param [Number, String] line_number the line at/near which the problem was found
      # @param [String, nil] message optional message; when omitted, the message
      #   can still be supplied the usual way via `raise err, "message"`
      def initialize(record_id, line_number, message = nil)
        super(message)
        @record_id   = record_id
        @line_number = line_number
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module MARC
  module AlephSequential

    # Mixin that gives including classes a #log method backed by a single
    # logger stored on the Log module itself.
    module Log

      class << self
        # The shared logger; nil until one is assigned or #log is first called
        attr_accessor :log
      end

      # @return the shared logger, creating a Yell logger on STDERR
      #   (levels :warn and :error, named 'MAS') if none has been set yet
      def log
        unless Log.log
          Log.log = Yell.new STDERR, :level => [ :warn, :error], :name=>'MAS'
        end
        Log.log
      end
    end
  end
end