blackwinter-athena 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,33 @@
1
+ = athena - Convert database files to various formats
2
+
3
+ == VERSION
4
+
5
+ This documentation refers to athena version 0.0.6
6
+
7
+
8
+ == DESCRIPTION
9
+
10
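+ athena converts database files to various formats. A parser reads records from the input in a given input format (spec), and each record is written in the requested output format; how fields are mapped is defined per target in a YAML config file. See <tt>athena --help</tt> for the available options.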
11
+
12
+
13
+ == AUTHORS
14
+
15
+ * Jens Wille <mailto:jens.wille@uni-koeln.de>
16
+
17
+
18
+ == LICENSE AND COPYRIGHT
19
+
20
+ Copyright (C) 2007-2008 University of Cologne,
21
+ Albertus-Magnus-Platz, 50932 Cologne, Germany
22
+
23
+ athena is free software: you can redistribute it and/or modify it under the
24
+ terms of the GNU General Public License as published by the Free Software
25
+ Foundation, either version 3 of the License, or (at your option) any later
26
+ version.
27
+
28
+ athena is distributed in the hope that it will be useful, but WITHOUT ANY
29
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
31
+
32
+ You should have received a copy of the GNU General Public License along with
33
+ athena. If not, see <http://www.gnu.org/licenses/>.
data/Rakefile ADDED
@@ -0,0 +1,22 @@
1
# Put the in-tree library first on the load path so that Athena::VERSION
# below is read from this checkout.
$:.unshift('lib')
require 'athena'

begin
  require 'hen'

  # The block handed to Hen.lay! returns the project settings hash. The hash
  # is assembled from named parts here, but still evaluated inside the block.
  Hen.lay! do
    rubyforge_settings = {
      :package => 'athena'
    }

    gem_settings = {
      :version      => Athena::VERSION,
      :summary      => 'Convert database files to various formats.',
      :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
      :extra_files  => FileList['[A-Z]*', 'example/*'].to_a,
      :dependencies => %w[xmlstreamin ruby-nuggets]
    }

    settings = {
      :rubyforge => rubyforge_settings,
      :gem       => gem_settings
    }

    settings
  end
rescue LoadError
  # Any LoadError raised above is reported as a missing 'hen' gem.
  abort "Please install the 'hen' gem first."
end
data/bin/athena ADDED
@@ -0,0 +1,183 @@
1
+ #! /usr/bin/ruby
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # athena -- Convert database files to various formats #
7
+ # #
8
+ # Copyright (C) 2007-2008 University of Cologne, #
9
+ # Albertus-Magnus-Platz, #
10
+ # 50932 Cologne, Germany #
11
+ # #
12
+ # Authors: #
13
+ # Jens Wille <jens.wille@uni-koeln.de> #
14
+ # #
15
+ # athena is free software; you can redistribute it and/or modify it under the #
16
+ # terms of the GNU General Public License as published by the Free Software #
17
+ # Foundation; either version 3 of the License, or (at your option) any later #
18
+ # version. #
19
+ # #
20
+ # athena is distributed in the hope that it will be useful, but WITHOUT ANY #
21
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
22
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
23
+ # details. #
24
+ # #
25
+ # You should have received a copy of the GNU General Public License along #
26
+ # with athena. If not, see <http://www.gnu.org/licenses/>. #
27
+ # #
28
+ ###############################################################################
29
+ #++
30
+
31
+ require 'optparse'
32
+ require 'yaml'
33
+
34
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
35
+
36
+ require 'athena'
37
+
38
# Command-line driver: parses the options, resolves the input format (spec),
# the output format and the config target, then runs the parser over the
# input and writes every converted record to the output.

USAGE = "Usage: #{$0} [-h|--help] [options]"
abort USAGE if ARGV.empty?

# Global verbosity switches, keyed by symbol. '-v' without an argument (or
# with 'all') sets the hash default to true, so every lookup is truthy.
$Verbose = {}

options = {
  :config => 'config.yaml',
  :input  => STDIN,
  :output => STDOUT,
  :target => nil
}

# Prints +title+ followed by one aligned "name = value" line per entry of
# +formats+, then exits successfully. Shared by '-L' and '-l'.
list_formats = lambda { |title, formats|
  puts title

  max = formats.map { |a, _| a.length }.max
  formats.each { |f, k|
    puts " - %-#{max}s = %s" % [f, k]
  }

  exit 0
}

option_parser = OptionParser.new { |opts|
  opts.banner = USAGE

  opts.separator ''
  opts.separator 'Options:'

  opts.on('-c', '--config YAML', "Config file [Default: #{options[:config]}#{' (currently not present)' unless File.readable?(options[:config])}]") { |f|
    abort "Can't find config file: #{f}." unless File.readable?(f)

    options[:config] = f
  }

  opts.separator ''

  opts.on('-i', '--input FILE', "Input file [Default: STDIN]") { |f|
    abort "Can't find input file: #{f}." unless File.readable?(f)

    options[:input] = File.directory?(f) ? Dir.open(f) : File.open(f, 'r')

    # The file ending doubles as the spec fallback; the rest of the name is
    # the target fallback (the whole name if there is no ending).
    parts = File.basename(f).split('.')
    options[:spec_fallback]   = parts.last.downcase
    options[:target_fallback] = parts.size > 1 ? parts[0..-2].join('.') : parts.first
  }

  opts.on('-s', '--spec SPEC', "Input format (spec) [Default: file ending of <input-file>]") { |s|
    options[:spec] = s.downcase
  }

  opts.on('-L', '--list-specs', "List available input formats (specs) and exit") {
    list_formats.call("Available input formats (specs):", Athena.input_formats)
  }

  opts.separator ''

  opts.on('-o', '--output FILE', "Output file [Default: STDOUT]") { |f|
    options[:output] = File.open(f, 'w')

    # Only look at the file name itself (as '-i' does), so that dots in
    # directory names (e.g. './out' or 'dir.d/out') are not mistaken for a
    # file ending.
    options[:format_fallback] = File.basename(f).split('.').last.downcase
  }

  opts.on('-f', '--format FORMAT', "Output format [Default: file ending of <output-file>]") { |f|
    options[:format] = f.downcase
  }

  opts.on('-l', '--list-formats', "List available output formats and exit") {
    list_formats.call("Available output formats:", Athena.output_formats)
  }

  opts.separator ''

  opts.on('-t', '--target ID', "Target whose config to use [Default: <input-file> minus file ending,", "plus '.<spec>', plus ':<format>' (reversely in turn)]") { |t|
    options[:target] = t
  }

  opts.separator ''
  opts.separator 'Generic options:'

  opts.on('-v', '--verbose [WHAT]', "Be verbose about what's being done. Optional argument is a comma-separated", "list of what should be output, or 'all' [Default: 'all']") { |what|
    if what.nil? || what == 'all'
      $Verbose.default = true
    else
      what.split(',').each { |w|
        $Verbose[w.to_sym] = true
      }
    end
  }

  # Help and version were explicitly requested, so they go to STDOUT and
  # exit successfully (abort would write to STDERR with exit status 1).
  opts.on('-h', '--help', 'Print this help message and exit') {
    puts opts
    exit
  }

  opts.on('--version', 'Print program version and exit') {
    puts "#{File.basename($0)} v#{Athena::VERSION}"
    exit
  }
}

begin
  option_parser.parse!
rescue OptionParser::ParseError => err
  # Unknown option, missing argument etc.: report it without a backtrace.
  abort "#{err}\n#{USAGE}"
end

spec = options[:spec] || options[:spec_fallback]
abort "No input format (spec) specified and none could be inferred." unless spec
abort "Invalid input format (spec): #{spec}. Use '-L' to get a list of available specs." unless Athena.valid_input_format?(spec)

format = options[:format] || options[:format_fallback]
abort "No output format specified and none could be inferred." unless format
abort "Invalid output format: #{format}. Use '-l' to get a list of available formats." unless Athena.valid_output_format?(format)

# The default config file is not checked by the '-c' handler, so verify it
# here before loading.
config_file = options[:config]
abort "Can't find config file: #{config_file}." unless File.readable?(config_file)

yaml = YAML.load_file(config_file)
abort "Invalid config file: #{config_file}." unless yaml.is_a?(Hash)

target = config = nil

if options[:target]
  target = options[:target]
  config = yaml[target.to_sym]
else
  # Try the most specific target first: '<name>.<spec>:<format>', then
  # '<name>.<spec>', then '<name>' alone. If none is configured, +target+ is
  # left at the last candidate tried, which is what the error below reports.
  base = options[:target_fallback] || 'generic'

  ["#{base}.#{spec}:#{format}", "#{base}.#{spec}", base].each { |candidate|
    target = candidate
    config = yaml[candidate.to_sym]

    break if config
  }
end
abort "Config not found for target: #{target}." unless config

parser = Athena.parser(config, spec)

if Athena.deferred_output?(format)
  # Deferred formats are converted only after all records have been parsed,
  # so the resulting lines can be sorted and de-duplicated as a whole.
  records = parser.parse(options[:input])

  records.map { |record|
    record.to(format)
  }.flatten.sort.uniq.each { |line|
    options[:output].puts line
  }
else
  # Otherwise each record is written out as soon as the parser yields it.
  records = parser.parse(options[:input]) { |record|
    options[:output].puts record.to(format)
  }
end

Athena::Util.verbose(:count) do
  spit records.size
end
@@ -0,0 +1,72 @@
1
+ :example:
2
+ :__record_element: "record"
3
+ :author: "author"
4
+ :title:
5
+ :elements:
6
+ - "title/main"
7
+ - "title/subtitle"
8
+ :string: "%s: %s"
9
+ :empty: ">>n/a<<"
10
+ :place:
11
+ :elements:
12
+ - "city"
13
+ - "country"
14
+ :separator: " / "
15
+ :multiple-fields-per-element:
16
+ :__record_element: "record"
17
+ :author:
18
+ :elements:
19
+ - "author"
20
+ - "city"
21
+ - "title/main"
22
+ :string: "author=%s (city=%s) [title/main=%s]"
23
+ :title:
24
+ :elements:
25
+ - "title/main"
26
+ - "title/subtitle"
27
+ - "author"
28
+ :string: "title/main=%s: title/subtitle=%s (author=%s)"
29
+ :empty: ">>n/a<<"
30
+ :place:
31
+ :elements:
32
+ - "city"
33
+ - "country"
34
+ - "title"
35
+ :string: "city=%s / country=%s (title=%s)"
36
+ :sisis-ex:
37
+ :__record_element: "0000" # KatalogNr
38
+ :author:
39
+ :elements:
40
+ - "0100" # VerfAnsetz
41
+ - "0101" # Verf_Ordn
42
+ :string: "%s (%s)"
43
+ :title:
44
+ :elements:
45
+ - "0331" # HST
46
+ - "0335" # HSTZusatz
47
+ - "0370" # Untertitstab
48
+ :string: "%s : %s [%s]"
49
+ :place: "2028" # ort2sb
50
+ :sisis-multiple-fields-per-element:
51
+ :__record_element: "0000"
52
+ :author:
53
+ :elements:
54
+ - "0100"
55
+ - "0101"
56
+ - "0331"
57
+ - "2028"
58
+ :string: "VerfAnsetz=%s (Verf_Ordn=%s) [HST=%s] / ort2sb=%s"
59
+ :title:
60
+ :elements:
61
+ - "0331"
62
+ - "0335"
63
+ - "0370"
64
+ - "0100"
65
+ :string: "HST=%s : HSTZusatz=%s [Untertitstab=%s] (VerfAnsetz=%s)"
66
+ :place:
67
+ :elements:
68
+ - "2028"
69
+ - "0335"
70
+ - "0370"
71
+ - "0100"
72
+ :string: "ort2sb=%s (HSTZusatz=%s [Untertitstab=%s] / VerfAnsetz=%s)"
@@ -0,0 +1,26 @@
1
+ <root>
2
+ <record>
3
+ <author>
4
+ <first>John Doe</first>
5
+ <second>John Q.</second>
6
+ <third>JJ</third>
7
+ </author>
8
+ <title>
9
+ <main>Just kiddin'</main>
10
+ <subtitle>heh?</subtitle>
11
+ </title>
12
+ <city>Nowhere</city>
13
+ <country>None</country>
14
+ </record>
15
+ <record>
16
+ <author>
17
+ Jane Doe
18
+ <separator />
19
+ JD
20
+ </author>
21
+ <title>
22
+ <main>No title</main>
23
+ </title>
24
+ <city>Nowhere</city>
25
+ </record>
26
+ </root>
@@ -0,0 +1,90 @@
1
+ 0000:3
2
+ 0001:000000003
3
+ 0002:02.05.2001
4
+ 0003:27.08.2002
5
+ 0015:ger; lat
6
+ 0036:m
7
+ 0100.001:[Pecka, Michael]
8
+ 0110.001:[Pieczek, Michael]
9
+ 0331:Denckwürdiges Geheimnuß
10
+ 0335:Teutsche Erklärung des Kupfferstücks
11
+ 0370.001:Dum tua privato fessus das lumina somno
12
+ 0370.002:Einer/ oder gar drey müssen für das Volck sterben
13
+ 0424:1620
14
+ 0425:[ca. 1620]
15
+ 0433:1 Bl.
16
+ 0434:1 Ill.; Radierung 19,5 x 27,5 cm(nach einem Kupferstich von Gaspar Dooms)
17
+ 0435:Satzspiegel 38 x 30 cm
18
+ 0440.001:[S.l.]
19
+ 0509:Verfasser ermittelt aus Nachweis
20
+ 0511:Erscheinungsjahr ermittelt aus Inhalt und Nachweis
21
+ 0527.001:Andere Ausgabe: Pfeffer, Maria
22
+ 0720.001:Politisches Flugblatt; Prokaiserliches Flugblatt
23
+ 0721.001:Dreissigjähriger Krieg; Böhmisch-Pfälzischer Krieg; Krieg; 1600-1650
24
+ 0722.001:Böhmen <Königreich>; Schlesien; Mähren; Lausitz; Bayern; Österreich; Prag
25
+ 0723.001:Ferdinand <Römisch-Deutsches Reich, Kaiser, II.>; Christus
26
+ 1105.001:Einbl. V,8 b-10
27
+ 1105.002:Einbl. V,60
28
+ 1125.001:Blattmaß 38 x 30 cm; Hinterklebt ; Klebereste
29
+ 1125.002:Blattmaß 30 x 31,5 cm; Fragment: unterer Textteil fehlt teilweise
30
+ 1145.001:ei
31
+ 1145.002:ei
32
+ 2005:Metallschnittrahmen; Metallschnittleiste als Spaltentrenner; Zahlen als Marginalien
33
+ 2006:Deutsche illustrierte Flugblätter des 16. und 17. Jahrhunderts . Hrsg. von Wolfgang Harms. Bd. 2
34
+ 2008:Denkwürdig Geheimnis Prophezeiung Zustand Böhmen Unwesen
35
+ 2018:Lateinischer Text in der Radierung als Beischriften; Im unteren Teil des Bildes Versform
36
+ 2023:¤ 61B2(Ferdinand <Römisch-Deutsches Reich, Kaiser, II.>)(+1)
37
+ 2028:BLA
38
+ 2029:001
39
+ 2030.001:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000183_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
40
+ 2030.002:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000368_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
41
+ 9999:
42
+ 0000:4
43
+ 0001:000000004
44
+ 0002:03.05.2001
45
+ 0003:29.08.2002
46
+ 0015:ger
47
+ 0036:m
48
+ 0331:Machometische Zanck= und Haderkatzen
49
+ 0370.001:UNser Planet Ist Machomet, Saturnisch Thier Sind wir allhier
50
+ 0424:1621
51
+ 0425:1621
52
+ 0433:1 Bl.
53
+ 0434:1 Ill.; Kupferstich 14 x 24 cm
54
+ 0435:Satzspiegel 30,5 x 24,5 cm
55
+ 0440.001:[S.l.]
56
+ 0720.001:Politische Flugblatt; Antiunionistisches Flugblatt
57
+ 0721.001:Dreissigjähriger Krieg; Böhmisch-Pfälzischer Krieg
58
+ 0723.001:Mu.hammad; Mohammed
59
+ 1105.001:Einbl. V,8 b-11
60
+ 1125.001:Blattmaß 32,5 x 26 cm; Hinterklebt
61
+ 1145.001:ei
62
+ 2005:Metallschnittleisten als Spaltentrenner; Textinitiale
63
+ 2006:Paas, John Roger: The German Political Broadsheet (1600 - 1700)
64
+ 2008:Mohammed; Zank; Hader; Katze
65
+ 2018:Versform; 4 Spalten
66
+ 2023:¤ 34B12 - Katze ¤ 25F(+51) - kämpfende Tiere; aggressive Beziehungen
67
+ 2029:002
68
+ 2030.001:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000184_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
69
+ 9999:
70
+ 0000:5
71
+ 0001:000000005
72
+ 0002:03.05.2001
73
+ 0003:29.08.2002
74
+ 0015:ger
75
+ 0036:m
76
+ 0331:Alles hatt Sein Zeydt
77
+ 0331.001:-- Hää?
78
+ 0424:1621
79
+ 0425:[1621]
80
+ 0433:1 Bl.
81
+ 0434:1 Ill.; 16 x 27,5 cm
82
+ 0435:Satspiegel 16 x 27,5 cm
83
+ 0440.001:[S.l.]
84
+ 0511:Erscheinungsjahr ermittelt aus Bildinhalt und Nachweis
85
+ 0511:Erscheinungsjahr ermittelt aus Bildinhalt und Nachweis
86
+ 0527.001:Andere Ausgabe: Einbl. V,8 b-8; Deutsche illustrierte Flugblätter des 16. und 17. Jahrhunderts
87
+ 2028:BLUB
88
+ 2029:003
89
+ 2030.001:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000185_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
90
+ 9999:
@@ -0,0 +1,66 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of athena, the database file converter. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # athena is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # athena is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with athena. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'iconv'
30
+
31
class Athena::Formats

  # Output format registered under the names 'dbm' and 'midos'. A record is
  # rendered as CRLF-terminated "FIELD:value" lines, closed by a record
  # separator line and an empty line.
  class DBM < Athena::Formats

    register_formats :out, 'dbm', 'midos'

    # Line terminator: carriage return + line feed.
    CRLF = "\015\012"

    # Converter from UTF-8 to Latin-1 (with the TRANSLIT and IGNORE flags).
    ICONV_TO_LATIN1 = Iconv.new('latin1//TRANSLIT//IGNORE', 'utf-8')

    # Joins multiple values of a single element.
    VALUE_SEPARATOR  = '|'

    # Marks the end of a record.
    RECORD_SEPARATOR = '&&&'

    # Renders +record+ as one string.
    #
    # The first line is "ID:<record id>". For every field in record.struct,
    # the values of each configured element are stripped, have runs of line
    # breaks replaced by a single space, and blank values are dropped; the
    # remaining values are joined with VALUE_SEPARATOR (or the field's
    # :empty placeholder is used if none remain). The per-element strings are
    # then interpolated into the field's :string format, converted to Latin-1
    # and emitted as "<FIELD>:<text>".
    def self.convert(record)
      lines = ["ID:#{record.id}"]

      record.struct.each do |field, field_struct|
        strings = field_struct[:elements].map do |element|
          raw     = field_struct[:values][element] || []
          cleaned = raw.map { |value| (value || '').strip.gsub(/(?:\r?\n)+/, ' ') }
          present = cleaned.reject { |value| value.empty? }

          if present.empty?
            field_struct[:empty]
          else
            present.join(VALUE_SEPARATOR)
          end
        end

        text = ICONV_TO_LATIN1.iconv(field_struct[:string] % strings)
        lines << "#{field.to_s.upcase}:#{text}"
      end

      lines << RECORD_SEPARATOR

      lines.join(CRLF) + CRLF + CRLF
    end

  end

end
@@ -0,0 +1,106 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of athena, the database file converter. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # athena is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # athena is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with athena. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'rubygems'
30
+
31
+ gem 'ferret', ENV['FERRET_VERSION'] if ENV['FERRET_VERSION']
32
+ require 'ferret'
33
+
34
class Athena::Formats

  # Input format registered under the name 'ferret': reads the documents of
  # a Ferret index and builds one Athena::Record per non-deleted document.
  class Ferret < Athena::Formats

    register_format :in, 'ferret'

    attr_reader :record_element, :config, :parser, :match_all_query

    # Sets up the format from +parser+'s config.
    #
    # The config is duplicated and its :__record_element entry is removed and
    # kept as the record element; the remaining entries are stored as the
    # field configuration.
    #
    # Raises NoRecordElementError if :__record_element is missing and
    # IllegalRecordElementError if it is not a String.
    def initialize(parser)
      config = parser.config.dup

      case @record_element = config.delete(:__record_element)
        when String
          # fine!
        when nil
          raise NoRecordElementError, 'no record element specified'
        else
          raise IllegalRecordElementError, "illegal record element #{@record_element}"
      end

      @config = config
      @parser = parser
    end

    # Reads the Ferret index located at +source+'s path and creates a record
    # for every document that is not marked as deleted. Each record is given
    # the parser's block and the document's record element field, and is
    # updated with every configured element.
    #
    # Returns the number of documents in the index (index.num_docs).
    #
    # Raises RuntimeError if the index can't be opened or can't be read.
    def parse(source)
      path = source.path

      # make sure the index can be opened
      begin
        File.open(File.join(path, 'segments')) {}
      rescue Errno::ENOENT, Errno::EACCES => err
        raise "can't open index at #{path} (#{err.to_s.sub(/ - .*/, '')})"
      end

      index = ::Ferret::Index::IndexReader.new(path)

      begin
        first, last = 0, index.max_doc - 1

        # make sure we can read from the index
        begin
          index[first]
          index[last]
        rescue StandardError  # EOFError, "Not available", ...
          # NB: the leading '::' is required -- a plain 'Ferret' refers to
          # this format class here, not to the Ferret library.
          raise "possible Ferret version mismatch; try to set the " <<
                "FERRET_VERSION environment variable to something " <<
                "other than #{::Ferret::VERSION}"
        end

        first.upto(last) { |i|
          unless index.deleted?(i)
            doc = index[i]

            Athena::Record.new(parser.block, doc[record_element]) { |record|
              config.each { |element, field_config|
                record.update(element, doc[element], field_config)
              }
            }
          end
        }

        index.num_docs
      ensure
        # Release the index reader, whether or not reading succeeded.
        index.close
      end
    end

    # Raised when the config does not specify a record element.
    # (No 'private' marker here: it would not apply to constants anyway.)
    class NoRecordElementError < StandardError
    end

    # Raised when the configured record element is not a String.
    class IllegalRecordElementError < StandardError
    end

  end

end
+ end