blackwinter-athena 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,33 @@
1
+ = athena - Convert database files to various formats
2
+
3
+ == VERSION
4
+
5
+ This documentation refers to athena version 0.0.6
6
+
7
+
8
+ == DESCRIPTION
9
+
10
+ TODO: well, the description... ;-)
11
+
12
+
13
+ == AUTHORS
14
+
15
+ * Jens Wille <mailto:jens.wille@uni-koeln.de>
16
+
17
+
18
+ == LICENSE AND COPYRIGHT
19
+
20
+ Copyright (C) 2007-2008 University of Cologne,
21
+ Albertus-Magnus-Platz, 50932 Cologne, Germany
22
+
23
+ athena is free software: you can redistribute it and/or modify it under the
24
+ terms of the GNU General Public License as published by the Free Software
25
+ Foundation, either version 3 of the License, or (at your option) any later
26
+ version.
27
+
28
+ athena is distributed in the hope that it will be useful, but WITHOUT ANY
29
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
31
+
32
+ You should have received a copy of the GNU General Public License along with
33
+ athena. If not, see <http://www.gnu.org/licenses/>.
data/Rakefile ADDED
@@ -0,0 +1,22 @@
1
# Make the in-tree library loadable so Athena::VERSION is available below.
$LOAD_PATH.unshift('lib')
require 'athena'

# The packaging tasks are provided by the 'hen' gem; without it there is
# nothing useful this Rakefile can do, so bail out with a hint.
begin
  require 'hen'

  Hen.lay! do
    rubyforge_options = { :package => 'athena' }

    gem_options = {
      :version      => Athena::VERSION,
      :summary      => 'Convert database files to various formats.',
      :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
      :extra_files  => FileList['[A-Z]*', 'example/*'].to_a,
      :dependencies => %w[xmlstreamin ruby-nuggets]
    }

    # The block's value is the configuration hash handed to Hen.
    { :rubyforge => rubyforge_options, :gem => gem_options }
  end
rescue LoadError
  abort "Please install the 'hen' gem first."
end
data/bin/athena ADDED
@@ -0,0 +1,183 @@
1
+ #! /usr/bin/ruby
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # athena -- Convert database files to various formats #
7
+ # #
8
+ # Copyright (C) 2007-2008 University of Cologne, #
9
+ # Albertus-Magnus-Platz, #
10
+ # 50932 Cologne, Germany #
11
+ # #
12
+ # Authors: #
13
+ # Jens Wille <jens.wille@uni-koeln.de> #
14
+ # #
15
+ # athena is free software; you can redistribute it and/or modify it under the #
16
+ # terms of the GNU General Public License as published by the Free Software #
17
+ # Foundation; either version 3 of the License, or (at your option) any later #
18
+ # version. #
19
+ # #
20
+ # athena is distributed in the hope that it will be useful, but WITHOUT ANY #
21
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
22
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
23
+ # details. #
24
+ # #
25
+ # You should have received a copy of the GNU General Public License along #
26
+ # with athena. If not, see <http://www.gnu.org/licenses/>. #
27
+ # #
28
+ ###############################################################################
29
+ #++
30
+
31
+ require 'optparse'
32
+ require 'yaml'
33
+
34
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
35
+
36
+ require 'athena'
37
+
38
# Command-line driver: parses the options, picks the input spec, the output
# format and the target configuration, then runs the parser and writes the
# converted records to the chosen output.

USAGE = "Usage: #{$0} [-h|--help] [options]"
abort USAGE if ARGV.empty?

# Global variable to handle verbosity; keys are the symbols given to
# '--verbose', and its default is switched to true for "all".
$Verbose = {}

options = {
  :config => 'config.yaml',
  :input  => STDIN,
  :output => STDOUT,
  :target => nil
}

# Prints a heading followed by the name/class pairs of +formats+, with the
# names padded to a common width, then exits successfully.
list_and_exit = lambda { |heading, formats|
  puts heading

  max = formats.map { |a, _| a.length }.max
  formats.each { |f, k|
    puts " - %-#{max}s = %s" % [f, k]
  }

  exit 0
}

OptionParser.new { |opts|
  opts.banner = USAGE

  opts.separator ''
  opts.separator 'Options:'

  opts.on('-c', '--config YAML', "Config file [Default: #{options[:config]}#{' (currently not present)' unless File.readable?(options[:config])}]") { |f|
    abort "Can't find config file: #{f}." unless File.readable?(f)

    options[:config] = f
  }

  opts.separator ''

  opts.on('-i', '--input FILE', "Input file [Default: STDIN]") { |f|
    abort "Can't find input file: #{f}." unless File.readable?(f)

    options[:input] = File.directory?(f) ? Dir.open(f) : File.open(f, 'r')

    # The file ending doubles as the default spec, and the rest of the name
    # as the default target.
    parts = File.basename(f).split('.')
    options[:spec_fallback]   = parts.last.downcase
    options[:target_fallback] = parts.size > 1 ? parts[0..-2].join('.') : parts.first
  }

  opts.on('-s', '--spec SPEC', "Input format (spec) [Default: file ending of <input-file>]") { |s|
    options[:spec] = s.downcase
  }

  opts.on('-L', '--list-specs', "List available input formats (specs) and exit") {
    list_and_exit.call("Available input formats (specs):", Athena.input_formats)
  }

  opts.separator ''

  opts.on('-o', '--output FILE', "Output file [Default: STDOUT]") { |f|
    options[:output] = File.open(f, 'w')

    # Only look at the file name itself, as is done for the input file, so
    # that a dot in a directory name is not mistaken for a file ending.
    options[:format_fallback] = File.basename(f).split('.').last.downcase
  }

  opts.on('-f', '--format FORMAT', "Output format [Default: file ending of <output-file>]") { |f|
    options[:format] = f.downcase
  }

  opts.on('-l', '--list-formats', "List available output formats and exit") {
    list_and_exit.call("Available output formats:", Athena.output_formats)
  }

  opts.separator ''

  opts.on('-t', '--target ID', "Target whose config to use [Default: <input-file> minus file ending,", "plus '.<spec>', plus ':<format>' (reversely in turn)]") { |t|
    options[:target] = t
  }

  opts.separator ''
  opts.separator 'Generic options:'

  opts.on('-v', '--verbose [WHAT]', "Be verbose about what's being done. Optional argument is a comma-separated", "list of what should be output, or 'all' [Default: 'all']") { |what|
    if what.nil? || what == 'all'
      $Verbose.default = true
    else
      what.split(',').each { |w|
        $Verbose[w.to_sym] = true
      }
    end
  }

  # NOTE: 'abort' writes to STDERR and exits with a non-zero status; kept
  # as is so that existing callers see unchanged behaviour.
  opts.on('-h', '--help', 'Print this help message and exit') {
    abort opts.to_s
  }

  opts.on('--version', 'Print program version and exit') {
    abort "#{File.basename($0)} v#{Athena::VERSION}"
  }
}.parse!

spec = options[:spec] || options[:spec_fallback]
abort "No input format (spec) specified and none could be inferred." unless spec
abort "Invalid input format (spec): #{spec}. Use '-L' to get a list of available specs." unless Athena.valid_input_format?(spec)

format = options[:format] || options[:format_fallback]
abort "No output format specified and none could be inferred." unless format
abort "Invalid output format: #{format}. Use '-l' to get a list of available formats." unless Athena.valid_output_format?(format)

# The default config file is not checked during option parsing, so make
# sure it exists before trying to load it.
abort "Can't find config file: #{options[:config]}." unless File.readable?(options[:config])

yaml = YAML.load_file(options[:config])

# Declared up front because they are assigned from inside a block below.
target = config = nil

if options[:target]
  target = options[:target]
  config = yaml[target.to_sym]
else
  base = options[:target_fallback] || 'generic'

  # Candidates from least to most specific:
  # "<base>", "<base>.<spec>", "<base>.<spec>:<format>"
  candidates = [base, ".#{spec}", ":#{format}"].inject([]) { |list, part|
    list << (list.last ? list.last + part : part)
  }

  # Try the most specific candidate first and stop at the first one that
  # has a config entry; +target+ ends up as the last candidate tried.
  candidates.reverse.find { |candidate|
    target = candidate
    config = yaml[candidate.to_sym]
  }
end
abort "Config not found for target: #{target}." unless config

parser = Athena.parser(config, spec)

if Athena.deferred_output?(format)
  # Deferred formats need all records first: the converted lines are
  # flattened, sorted and de-duplicated before anything is written.
  records = parser.parse(options[:input])

  records.map { |record|
    record.to(format)
  }.flatten.sort.uniq.each { |line|
    options[:output].puts line
  }
else
  # Otherwise each record is written as soon as the parser hands it over.
  records = parser.parse(options[:input]) { |record|
    options[:output].puts record.to(format)
  }
end

Athena::Util.verbose(:count) do
  spit records.size
end
@@ -0,0 +1,72 @@
1
+ :example:
2
+ :__record_element: "record"
3
+ :author: "author"
4
+ :title:
5
+ :elements:
6
+ - "title/main"
7
+ - "title/subtitle"
8
+ :string: "%s: %s"
9
+ :empty: ">>n/a<<"
10
+ :place:
11
+ :elements:
12
+ - "city"
13
+ - "country"
14
+ :separator: " / "
15
+ :multiple-fields-per-element:
16
+ :__record_element: "record"
17
+ :author:
18
+ :elements:
19
+ - "author"
20
+ - "city"
21
+ - "title/main"
22
+ :string: "author=%s (city=%s) [title/main=%s]"
23
+ :title:
24
+ :elements:
25
+ - "title/main"
26
+ - "title/subtitle"
27
+ - "author"
28
+ :string: "title/main=%s: title/subtitle=%s (author=%s)"
29
+ :empty: ">>n/a<<"
30
+ :place:
31
+ :elements:
32
+ - "city"
33
+ - "country"
34
+ - "title"
35
+ :string: "city=%s / country=%s (title=%s)"
36
+ :sisis-ex:
37
+ :__record_element: "0000" # KatalogNr
38
+ :author:
39
+ :elements:
40
+ - "0100" # VerfAnsetz
41
+ - "0101" # Verf_Ordn
42
+ :string: "%s (%s)"
43
+ :title:
44
+ :elements:
45
+ - "0331" # HST
46
+ - "0335" # HSTZusatz
47
+ - "0370" # Untertitstab
48
+ :string: "%s : %s [%s]"
49
+ :place: "2028" # ort2sb
50
+ :sisis-multiple-fields-per-element:
51
+ :__record_element: "0000"
52
+ :author:
53
+ :elements:
54
+ - "0100"
55
+ - "0101"
56
+ - "0331"
57
+ - "2028"
58
+ :string: "VerfAnsetz=%s (Verf_Ordn=%s) [HST=%s] / ort2sb=%s"
59
+ :title:
60
+ :elements:
61
+ - "0331"
62
+ - "0335"
63
+ - "0370"
64
+ - "0100"
65
+ :string: "HST=%s : HSTZusatz=%s [Untertitstab=%s] (VerfAnsetz=%s)"
66
+ :place:
67
+ :elements:
68
+ - "2028"
69
+ - "0335"
70
+ - "0370"
71
+ - "0100"
72
+ :string: "ort2sb=%s (HSTZusatz=%s [Untertitstab=%s] / VerfAnsetz=%s)"
@@ -0,0 +1,26 @@
1
+ <root>
2
+ <record>
3
+ <author>
4
+ <first>John Doe</first>
5
+ <second>John Q.</second>
6
+ <third>JJ</third>
7
+ </author>
8
+ <title>
9
+ <main>Just kiddin'</main>
10
+ <subtitle>heh?</subtitle>
11
+ </title>
12
+ <city>Nowhere</city>
13
+ <country>None</country>
14
+ </record>
15
+ <record>
16
+ <author>
17
+ Jane Doe
18
+ <separator />
19
+ JD
20
+ </author>
21
+ <title>
22
+ <main>No title</main>
23
+ </title>
24
+ <city>Nowhere</city>
25
+ </record>
26
+ </root>
@@ -0,0 +1,90 @@
1
+ 0000:3
2
+ 0001:000000003
3
+ 0002:02.05.2001
4
+ 0003:27.08.2002
5
+ 0015:ger; lat
6
+ 0036:m
7
+ 0100.001:[Pecka, Michael]
8
+ 0110.001:[Pieczek, Michael]
9
+ 0331:Denckwürdiges Geheimnuß
10
+ 0335:Teutsche Erklärung des Kupfferstücks
11
+ 0370.001:Dum tua privato fessus das lumina somno
12
+ 0370.002:Einer/ oder gar drey müssen für das Volck sterben
13
+ 0424:1620
14
+ 0425:[ca. 1620]
15
+ 0433:1 Bl.
16
+ 0434:1 Ill.; Radierung 19,5 x 27,5 cm(nach einem Kupferstich von Gaspar Dooms)
17
+ 0435:Satzspiegel 38 x 30 cm
18
+ 0440.001:[S.l.]
19
+ 0509:Verfasser ermittelt aus Nachweis
20
+ 0511:Erscheinungsjahr ermittelt aus Inhalt und Nachweis
21
+ 0527.001:Andere Ausgabe: Pfeffer, Maria
22
+ 0720.001:Politisches Flugblatt; Prokaiserliches Flugblatt
23
+ 0721.001:Dreissigjähriger Krieg; Böhmisch-Pfälzischer Krieg; Krieg; 1600-1650
24
+ 0722.001:Böhmen <Königreich>; Schlesien; Mähren; Lausitz; Bayern; Österreich; Prag
25
+ 0723.001:Ferdinand <Römisch-Deutsches Reich, Kaiser, II.>; Christus
26
+ 1105.001:Einbl. V,8 b-10
27
+ 1105.002:Einbl. V,60
28
+ 1125.001:Blattmaß 38 x 30 cm; Hinterklebt ; Klebereste
29
+ 1125.002:Blattmaß 30 x 31,5 cm; Fragment: unterer Textteil fehlt teilweise
30
+ 1145.001:ei
31
+ 1145.002:ei
32
+ 2005:Metallschnittrahmen; Metallschnittleiste als Spaltentrenner; Zahlen als Marginalien
33
+ 2006:Deutsche illustrierte Flugblätter des 16. und 17. Jahrhunderts . Hrsg. von Wolfgang Harms. Bd. 2
34
+ 2008:Denkwürdig Geheimnis Prophezeiung Zustand Böhmen Unwesen
35
+ 2018:Lateinischer Text in der Radierung als Beischriften; Im unteren Teil des Bildes Versform
36
+ 2023:¤ 61B2(Ferdinand <Römisch-Deutsches Reich, Kaiser, II.>)(+1)
37
+ 2028:BLA
38
+ 2029:001
39
+ 2030.001:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000183_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
40
+ 2030.002:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000368_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
41
+ 9999:
42
+ 0000:4
43
+ 0001:000000004
44
+ 0002:03.05.2001
45
+ 0003:29.08.2002
46
+ 0015:ger
47
+ 0036:m
48
+ 0331:Machometische Zanck= und Haderkatzen
49
+ 0370.001:UNser Planet Ist Machomet, Saturnisch Thier Sind wir allhier
50
+ 0424:1621
51
+ 0425:1621
52
+ 0433:1 Bl.
53
+ 0434:1 Ill.; Kupferstich 14 x 24 cm
54
+ 0435:Satzspiegel 30,5 x 24,5 cm
55
+ 0440.001:[S.l.]
56
+ 0720.001:Politische Flugblatt; Antiunionistisches Flugblatt
57
+ 0721.001:Dreissigjähriger Krieg; Böhmisch-Pfälzischer Krieg
58
+ 0723.001:Mu.hammad; Mohammed
59
+ 1105.001:Einbl. V,8 b-11
60
+ 1125.001:Blattmaß 32,5 x 26 cm; Hinterklebt
61
+ 1145.001:ei
62
+ 2005:Metallschnittleisten als Spaltentrenner; Textinitiale
63
+ 2006:Paas, John Roger: The German Political Broadsheet (1600 - 1700)
64
+ 2008:Mohammed; Zank; Hader; Katze
65
+ 2018:Versform; 4 Spalten
66
+ 2023:¤ 34B12 - Katze ¤ 25F(+51) - kämpfende Tiere; aggressive Beziehungen
67
+ 2029:002
68
+ 2030.001:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000184_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
69
+ 9999:
70
+ 0000:5
71
+ 0001:000000005
72
+ 0002:03.05.2001
73
+ 0003:29.08.2002
74
+ 0015:ger
75
+ 0036:m
76
+ 0331:Alles hatt Sein Zeydt
77
+ 0331.001:-- Hää?
78
+ 0424:1621
79
+ 0425:[1621]
80
+ 0433:1 Bl.
81
+ 0434:1 Ill.; 16 x 27,5 cm
82
+ 0435:Satspiegel 16 x 27,5 cm
83
+ 0440.001:[S.l.]
84
+ 0511:Erscheinungsjahr ermittelt aus Bildinhalt und Nachweis
85
+ 0511:Erscheinungsjahr ermittelt aus Bildinhalt und Nachweis
86
+ 0527.001:Andere Ausgabe: Einbl. V,8 b-8; Deutsche illustrierte Flugblätter des 16. und 17. Jahrhunderts
87
+ 2028:BLUB
88
+ 2029:003
89
+ 2030.001:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000185_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
90
+ 9999:
@@ -0,0 +1,66 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of athena, the database file converter. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # athena is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # athena is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with athena. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'iconv'
30
+
31
class Athena::Formats

  # Output format registered under the names 'dbm' and 'midos'. Each record
  # becomes a block of CRLF-terminated "FIELD:value" lines, closed by a
  # record separator line and an empty line.
  class DBM < Athena::Formats

    register_formats :out, 'dbm', 'midos'

    # Line terminator (carriage return + line feed).
    CRLF = "\015\012"

    # Converter from UTF-8 to Latin-1 (transliterating or dropping
    # characters that cannot be represented).
    ICONV_TO_LATIN1 = Iconv.new('latin1//TRANSLIT//IGNORE', 'utf-8')

    # Joins multiple values of a single element.
    VALUE_SEPARATOR  = '|'

    # Marks the end of a record.
    RECORD_SEPARATOR = '&&&'

    # Renders +record+ as a single string.
    #
    # Reads <tt>record.id</tt> and <tt>record.struct</tt>; the latter is
    # iterated as field/struct pairs, where each struct is accessed via the
    # keys <tt>:elements</tt>, <tt>:values</tt>, <tt>:empty</tt> and
    # <tt>:string</tt> (a format string applied to one argument per element).
    def self.convert(record)
      lines = ["ID:#{record.id}"]

      record.struct.each do |field, struct|
        arguments = struct[:elements].map do |element|
          cleaned = []

          (struct[:values][element] || []).each do |value|
            # Trim the value and collapse runs of line breaks into a space.
            text = (value || '').strip.gsub(/(?:\r?\n)+/, ' ')
            cleaned << text unless text.empty?
          end

          # Elements without any usable value fall back to the placeholder.
          cleaned.empty? ? struct[:empty] : cleaned.join(VALUE_SEPARATOR)
        end

        formatted = struct[:string] % arguments
        lines << "#{field.to_s.upcase}:#{ICONV_TO_LATIN1.iconv(formatted)}"
      end

      lines << RECORD_SEPARATOR

      lines.join(CRLF) << CRLF << CRLF
    end

  end

end
@@ -0,0 +1,106 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of athena, the database file converter. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # athena is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # athena is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with athena. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'rubygems'
30
+
31
+ gem 'ferret', ENV['FERRET_VERSION'] if ENV['FERRET_VERSION']
32
+ require 'ferret'
33
+
34
class Athena::Formats

  # Input format registered under the name 'ferret'; reads the documents of
  # a Ferret index and turns each one into an Athena::Record.
  #
  # NOTE: inside this class the bare constant +Ferret+ refers to this class
  # itself, so the Ferret library must always be addressed as <tt>::Ferret</tt>.
  class Ferret < Athena::Formats

    register_format :in, 'ferret'

    # +match_all_query+ is never assigned within this class, so its reader
    # returns +nil+ unless it is set elsewhere.
    attr_reader :record_element, :config, :parser, :match_all_query

    # Sets up the format for +parser+, whose +config+ must contain a String
    # under <tt>:__record_element</tt>. The remaining config entries are
    # kept as the element configuration.
    #
    # Raises NoRecordElementError if no record element is given, and
    # IllegalRecordElementError if it is not a String.
    def initialize(parser)
      # Work on a copy so that removing the key leaves the parser's own
      # config hash untouched.
      config = parser.config.dup

      case @record_element = config.delete(:__record_element)
        when String
          # fine!
        when nil
          raise NoRecordElementError, 'no record element specified'
        else
          raise IllegalRecordElementError, "illegal record element #{@record_element}"
      end

      @config = config
      @parser = parser
    end

    # Reads the index located at <tt>source.path</tt> and creates one
    # Athena::Record per document that is not marked as deleted. Each record
    # is updated with the document's value for every configured element.
    #
    # Returns the number of documents reported by the index.
    #
    # Raises a RuntimeError if the index cannot be opened or read.
    def parse(source)
      path = source.path

      # make sure the index can be opened
      begin
        File.open(File.join(path, 'segments')) {}
      rescue Errno::ENOENT, Errno::EACCES => err
        raise "can't open index at #{path} (#{err.to_s.sub(/ - .*/, '')})"
      end

      index = ::Ferret::Index::IndexReader.new(path)
      first, last = 0, index.max_doc - 1

      # make sure we can read from the index
      begin
        index[first]
        index[last]
      rescue StandardError  # EOFError, "Not available", ...
        # The leading '::' is required: a bare 'Ferret' would resolve to
        # this class rather than to the Ferret library.
        raise "possible Ferret version mismatch; try to set the " <<
              "FERRET_VERSION environment variable to something " <<
              "other than #{::Ferret::VERSION}"
      end

      first.upto(last) { |i|
        unless index.deleted?(i)
          doc = index[i]

          Athena::Record.new(parser.block, doc[record_element]) { |record|
            config.each { |element, field_config|
              record.update(element, doc[element], field_config)
            }
          }
        end
      }

      index.num_docs
    end

    private

    # Raised when the config lacks a record element.
    class NoRecordElementError < StandardError
    end

    # Raised when the configured record element is not a String.
    class IllegalRecordElementError < StandardError
    end

  end

end
+ end