blackwinter-athena 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +676 -0
- data/ChangeLog +5 -0
- data/README +33 -0
- data/Rakefile +22 -0
- data/bin/athena +183 -0
- data/example/config.yaml +72 -0
- data/example/example.xml +26 -0
- data/example/sisis-ex.txt +90 -0
- data/lib/athena/formats/dbm.rb +66 -0
- data/lib/athena/formats/ferret.rb +106 -0
- data/lib/athena/formats/lingo.rb +141 -0
- data/lib/athena/formats/sisis.rb +79 -0
- data/lib/athena/formats/xml.rb +274 -0
- data/lib/athena/formats.rb +88 -0
- data/lib/athena/parser.rb +90 -0
- data/lib/athena/record.rb +107 -0
- data/lib/athena/util.rb +46 -0
- data/lib/athena/version.rb +51 -0
- data/lib/athena.rb +75 -0
- metadata +98 -0
data/README
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
= athena - Convert database files to various formats
|
2
|
+
|
3
|
+
== VERSION
|
4
|
+
|
5
|
+
This documentation refers to athena version 0.0.6
|
6
|
+
|
7
|
+
|
8
|
+
== DESCRIPTION
|
9
|
+
|
10
|
+
TODO: well, the description... ;-)
|
11
|
+
|
12
|
+
|
13
|
+
== AUTHORS
|
14
|
+
|
15
|
+
* Jens Wille <mailto:jens.wille@uni-koeln.de>
|
16
|
+
|
17
|
+
|
18
|
+
== LICENSE AND COPYRIGHT
|
19
|
+
|
20
|
+
Copyright (C) 2007-2008 University of Cologne,
|
21
|
+
Albertus-Magnus-Platz, 50932 Cologne, Germany
|
22
|
+
|
23
|
+
athena is free software: you can redistribute it and/or modify it under the
|
24
|
+
terms of the GNU General Public License as published by the Free Software
|
25
|
+
Foundation, either version 3 of the License, or (at your option) any later
|
26
|
+
version.
|
27
|
+
|
28
|
+
athena is distributed in the hope that it will be useful, but WITHOUT ANY
|
29
|
+
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
30
|
+
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
31
|
+
|
32
|
+
You should have received a copy of the GNU General Public License along with
|
33
|
+
athena. If not, see <http://www.gnu.org/licenses/>.
|
data/Rakefile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
$:.unshift('lib')
|
2
|
+
require 'athena'
|
3
|
+
|
4
|
+
# Load hen and hand it the project settings. Any LoadError raised in here
# (most notably a missing 'hen' gem) ends the run with an installation hint.
begin
  require 'hen'

  # The block's value is the settings hash that Hen.lay! receives.
  Hen.lay! do
    rubyforge_settings = { :package => 'athena' }

    gem_settings = {
      :version      => Athena::VERSION,
      :summary      => 'Convert database files to various formats.',
      :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
      :extra_files  => FileList['[A-Z]*', 'example/*'].to_a,
      :dependencies => %w[xmlstreamin ruby-nuggets]
    }

    { :rubyforge => rubyforge_settings, :gem => gem_settings }
  end
rescue LoadError
  abort "Please install the 'hen' gem first."
end
|
data/bin/athena
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# athena -- Convert database files to various formats #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
9
|
+
# Albertus-Magnus-Platz, #
|
10
|
+
# 50932 Cologne, Germany #
|
11
|
+
# #
|
12
|
+
# Authors: #
|
13
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
14
|
+
# #
|
15
|
+
# athena is free software; you can redistribute it and/or modify it under the #
|
16
|
+
# terms of the GNU General Public License as published by the Free Software #
|
17
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
18
|
+
# version. #
|
19
|
+
# #
|
20
|
+
# athena is distributed in the hope that it will be useful, but WITHOUT ANY #
|
21
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
22
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
23
|
+
# details. #
|
24
|
+
# #
|
25
|
+
# You should have received a copy of the GNU General Public License along #
|
26
|
+
# with athena. If not, see <http://www.gnu.org/licenses/>. #
|
27
|
+
# #
|
28
|
+
###############################################################################
|
29
|
+
#++
|
30
|
+
|
31
|
+
require 'optparse'
|
32
|
+
require 'yaml'
|
33
|
+
|
34
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
35
|
+
|
36
|
+
require 'athena'
|
37
|
+
|
38
|
+
# Command line handling for the athena executable.
#
# Builds the +options+ hash consumed by the rest of the script:
#   :config           - path of the YAML config file
#   :input / :output  - IO (or Dir) objects to read from / write to
#   :spec / :format   - explicitly requested input / output format
#   :*_fallback       - values inferred from the input / output file names
#   :target           - explicitly requested config target
USAGE = "Usage: #{$0} [-h|--help] [options]"
abort USAGE if ARGV.empty?

# Global variable to handle verbosity
$Verbose = {}

options = {
  :config => 'config.yaml',
  :input  => STDIN,
  :output => STDOUT,
  :target => nil
}

# Prints +title+ followed by one aligned "name = value" line per entry of
# +formats+ (anything yielding [name, value] pairs). Shared by the two
# listing options below.
list_formats = lambda { |title, formats|
  puts title

  max = formats.map { |name, _| name.length }.max
  formats.each { |name, klass|
    puts " - %-#{max}s = %s" % [name, klass]
  }
}

OptionParser.new { |opts|
  opts.banner = USAGE

  opts.separator ''
  opts.separator 'Options:'

  opts.on('-c', '--config YAML', "Config file [Default: #{options[:config]}#{' (currently not present)' unless File.readable?(options[:config])}]") { |f|
    abort "Can't find config file: #{f}." unless File.readable?(f)

    options[:config] = f
  }

  opts.separator ''

  opts.on('-i', '--input FILE', "Input file [Default: STDIN]") { |f|
    abort "Can't find input file: #{f}." unless File.readable?(f)

    options[:input] = File.directory?(f) ? Dir.open(f) : File.open(f, 'r')

    # Infer spec and target from the file name. For '.' and '..' the split
    # yields no parts at all; leave the fallbacks unset in that case instead
    # of calling a method on nil.
    parts = File.basename(f).split('.')
    unless parts.empty?
      options[:spec_fallback]   = parts.last.downcase
      options[:target_fallback] = parts.size > 1 ? parts[0..-2].join('.') : parts.first
    end
  }

  opts.on('-s', '--spec SPEC', "Input format (spec) [Default: file ending of <input-file>]") { |s|
    options[:spec] = s.downcase
  }

  opts.on('-L', '--list-specs', "List available input formats (specs) and exit") {
    list_formats.call("Available input formats (specs):", Athena.input_formats)
    exit 0
  }

  opts.separator ''

  opts.on('-o', '--output FILE', "Output file [Default: STDOUT]") { |f|
    options[:output] = File.open(f, 'w')

    options[:format_fallback] = f.split('.').last.downcase
  }

  opts.on('-f', '--format FORMAT', "Output format [Default: file ending of <output-file>]") { |f|
    options[:format] = f.downcase
  }

  opts.on('-l', '--list-formats', "List available output formats and exit") {
    list_formats.call("Available output formats:", Athena.output_formats)
    exit 0
  }

  opts.separator ''

  opts.on('-t', '--target ID', "Target whose config to use [Default: <input-file> minus file ending,", "plus '.<spec>', plus ':<format>' (reversely in turn)]") { |t|
    options[:target] = t
  }

  opts.separator ''
  opts.separator 'Generic options:'

  opts.on('-v', '--verbose [WHAT]', "Be verbose about what's being done. Optional argument is a comma-separated", "list of what should be output, or 'all' [Default: 'all']") { |what|
    if what.nil? || what == 'all'
      # A truthy default makes every lookup in $Verbose succeed.
      $Verbose.default = true
    else
      what.split(',').each { |w|
        $Verbose[w.to_sym] = true
      }
    end
  }

  # Explicitly requested help/version output is not an error: print to
  # STDOUT and exit successfully, like the listing options above.
  opts.on('-h', '--help', 'Print this help message and exit') {
    puts opts
    exit 0
  }

  opts.on('--version', 'Print program version and exit') {
    puts "#{File.basename($0)} v#{Athena::VERSION}"
    exit 0
  }
}.parse!
|
142
|
+
|
143
|
+
# Input format: an explicit --spec takes precedence over the one inferred
# from the input file name.
spec = options[:spec] || options[:spec_fallback]
unless spec
  abort "No input format (spec) specified and none could be inferred."
end
unless Athena.valid_input_format?(spec)
  abort "Invalid input format (spec): #{spec}. Use '-L' to get a list of available specs."
end

# Output format: same precedence, explicit --format first.
format = options[:format] || options[:format_fallback]
unless format
  abort "No output format specified and none could be inferred."
end
unless Athena.valid_output_format?(format)
  abort "Invalid output format: #{format}. Use '-l' to get a list of available formats."
end

yaml = YAML.load_file(options[:config])

target = options[:target]
config = nil

if target
  config = yaml[target.to_sym]
else
  # Try the most specific target name first, then progressively drop the
  # ':<format>' and '.<spec>' suffixes. When nothing matches, +target+ is
  # left at the last (least specific) candidate for the error message.
  base = options[:target_fallback] || 'generic'
  with_spec   = base + ".#{spec}"
  with_format = with_spec + ":#{format}"

  [with_format, with_spec, base].each { |candidate|
    target = candidate
    config = yaml[candidate.to_sym]
    break if config
  }
end

abort "Config not found for target: #{target}." unless config

parser = Athena.parser(config, spec)

if Athena.deferred_output?(format)
  # Convert every record first; the combined output is then flattened,
  # sorted and de-duplicated before anything is written.
  records = parser.parse(options[:input])

  converted = records.map { |record| record.to(format) }
  converted.flatten.sort.uniq.each { |line| options[:output].puts line }
else
  # Write each record as soon as the parser hands it to the block.
  records = parser.parse(options[:input]) { |record|
    options[:output].puts record.to(format)
  }
end

Athena::Util.verbose(:count) do
  spit records.size
end
|
data/example/config.yaml
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
:example:
|
2
|
+
:__record_element: "record"
|
3
|
+
:author: "author"
|
4
|
+
:title:
|
5
|
+
:elements:
|
6
|
+
- "title/main"
|
7
|
+
- "title/subtitle"
|
8
|
+
:string: "%s: %s"
|
9
|
+
:empty: ">>n/a<<"
|
10
|
+
:place:
|
11
|
+
:elements:
|
12
|
+
- "city"
|
13
|
+
- "country"
|
14
|
+
:separator: " / "
|
15
|
+
:multiple-fields-per-element:
|
16
|
+
:__record_element: "record"
|
17
|
+
:author:
|
18
|
+
:elements:
|
19
|
+
- "author"
|
20
|
+
- "city"
|
21
|
+
- "title/main"
|
22
|
+
:string: "author=%s (city=%s) [title/main=%s]"
|
23
|
+
:title:
|
24
|
+
:elements:
|
25
|
+
- "title/main"
|
26
|
+
- "title/subtitle"
|
27
|
+
- "author"
|
28
|
+
:string: "title/main=%s: title/subtitle=%s (author=%s)"
|
29
|
+
:empty: ">>n/a<<"
|
30
|
+
:place:
|
31
|
+
:elements:
|
32
|
+
- "city"
|
33
|
+
- "country"
|
34
|
+
- "title"
|
35
|
+
:string: "city=%s / country=%s (title=%s)"
|
36
|
+
:sisis-ex:
|
37
|
+
:__record_element: "0000" # KatalogNr
|
38
|
+
:author:
|
39
|
+
:elements:
|
40
|
+
- "0100" # VerfAnsetz
|
41
|
+
- "0101" # Verf_Ordn
|
42
|
+
:string: "%s (%s)"
|
43
|
+
:title:
|
44
|
+
:elements:
|
45
|
+
- "0331" # HST
|
46
|
+
- "0335" # HSTZusatz
|
47
|
+
- "0370" # Untertitstab
|
48
|
+
:string: "%s : %s [%s]"
|
49
|
+
:place: "2028" # ort2sb
|
50
|
+
:sisis-multiple-fields-per-element:
|
51
|
+
:__record_element: "0000"
|
52
|
+
:author:
|
53
|
+
:elements:
|
54
|
+
- "0100"
|
55
|
+
- "0101"
|
56
|
+
- "0331"
|
57
|
+
- "2028"
|
58
|
+
:string: "VerfAnsetz=%s (Verf_Ordn=%s) [HST=%s] / ort2sb=%s"
|
59
|
+
:title:
|
60
|
+
:elements:
|
61
|
+
- "0331"
|
62
|
+
- "0335"
|
63
|
+
- "0370"
|
64
|
+
- "0100"
|
65
|
+
:string: "HST=%s : HSTZusatz=%s [Untertitstab=%s] (VerfAnsetz=%s)"
|
66
|
+
:place:
|
67
|
+
:elements:
|
68
|
+
- "2028"
|
69
|
+
- "0335"
|
70
|
+
- "0370"
|
71
|
+
- "0100"
|
72
|
+
:string: "ort2sb=%s (HSTZusatz=%s [Untertitstab=%s] / VerfAnsetz=%s)"
|
data/example/example.xml
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
<root>
|
2
|
+
<record>
|
3
|
+
<author>
|
4
|
+
<first>John Doe</first>
|
5
|
+
<second>John Q.</second>
|
6
|
+
<third>JJ</third>
|
7
|
+
</author>
|
8
|
+
<title>
|
9
|
+
<main>Just kiddin'</main>
|
10
|
+
<subtitle>heh?</subtitle>
|
11
|
+
</title>
|
12
|
+
<city>Nowhere</city>
|
13
|
+
<country>None</country>
|
14
|
+
</record>
|
15
|
+
<record>
|
16
|
+
<author>
|
17
|
+
Jane Doe
|
18
|
+
<separator />
|
19
|
+
JD
|
20
|
+
</author>
|
21
|
+
<title>
|
22
|
+
<main>No title</main>
|
23
|
+
</title>
|
24
|
+
<city>Nowhere</city>
|
25
|
+
</record>
|
26
|
+
</root>
|
@@ -0,0 +1,90 @@
|
|
1
|
+
0000:3
|
2
|
+
0001:000000003
|
3
|
+
0002:02.05.2001
|
4
|
+
0003:27.08.2002
|
5
|
+
0015:ger; lat
|
6
|
+
0036:m
|
7
|
+
0100.001:[Pecka, Michael]
|
8
|
+
0110.001:[Pieczek, Michael]
|
9
|
+
0331:Denckwürdiges Geheimnuß
|
10
|
+
0335:Teutsche Erklärung des Kupfferstücks
|
11
|
+
0370.001:Dum tua privato fessus das lumina somno
|
12
|
+
0370.002:Einer/ oder gar drey müssen für das Volck sterben
|
13
|
+
0424:1620
|
14
|
+
0425:[ca. 1620]
|
15
|
+
0433:1 Bl.
|
16
|
+
0434:1 Ill.; Radierung 19,5 x 27,5 cm(nach einem Kupferstich von Gaspar Dooms)
|
17
|
+
0435:Satzspiegel 38 x 30 cm
|
18
|
+
0440.001:[S.l.]
|
19
|
+
0509:Verfasser ermittelt aus Nachweis
|
20
|
+
0511:Erscheinungsjahr ermittelt aus Inhalt und Nachweis
|
21
|
+
0527.001:Andere Ausgabe: Pfeffer, Maria
|
22
|
+
0720.001:Politisches Flugblatt; Prokaiserliches Flugblatt
|
23
|
+
0721.001:Dreissigjähriger Krieg; Böhmisch-Pfälzischer Krieg; Krieg; 1600-1650
|
24
|
+
0722.001:Böhmen <Königreich>; Schlesien; Mähren; Lausitz; Bayern; Österreich; Prag
|
25
|
+
0723.001:Ferdinand <Römisch-Deutsches Reich, Kaiser, II.>; Christus
|
26
|
+
1105.001:Einbl. V,8 b-10
|
27
|
+
1105.002:Einbl. V,60
|
28
|
+
1125.001:Blattmaß 38 x 30 cm; Hinterklebt ; Klebereste
|
29
|
+
1125.002:Blattmaß 30 x 31,5 cm; Fragment: unterer Textteil fehlt teilweise
|
30
|
+
1145.001:ei
|
31
|
+
1145.002:ei
|
32
|
+
2005:Metallschnittrahmen; Metallschnittleiste als Spaltentrenner; Zahlen als Marginalien
|
33
|
+
2006:Deutsche illustrierte Flugblätter des 16. und 17. Jahrhunderts . Hrsg. von Wolfgang Harms. Bd. 2
|
34
|
+
2008:Denkwürdig Geheimnis Prophezeiung Zustand Böhmen Unwesen
|
35
|
+
2018:Lateinischer Text in der Radierung als Beischriften; Im unteren Teil des Bildes Versform
|
36
|
+
2023:¤ 61B2(Ferdinand <Römisch-Deutsches Reich, Kaiser, II.>)(+1)
|
37
|
+
2028:BLA
|
38
|
+
2029:001
|
39
|
+
2030.001:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000183_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
|
40
|
+
2030.002:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000368_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
|
41
|
+
9999:
|
42
|
+
0000:4
|
43
|
+
0001:000000004
|
44
|
+
0002:03.05.2001
|
45
|
+
0003:29.08.2002
|
46
|
+
0015:ger
|
47
|
+
0036:m
|
48
|
+
0331:Machometische Zanck= und Haderkatzen
|
49
|
+
0370.001:UNser Planet Ist Machomet, Saturnisch Thier Sind wir allhier
|
50
|
+
0424:1621
|
51
|
+
0425:1621
|
52
|
+
0433:1 Bl.
|
53
|
+
0434:1 Ill.; Kupferstich 14 x 24 cm
|
54
|
+
0435:Satzspiegel 30,5 x 24,5 cm
|
55
|
+
0440.001:[S.l.]
|
56
|
+
0720.001:Politische Flugblatt; Antiunionistisches Flugblatt
|
57
|
+
0721.001:Dreissigjähriger Krieg; Böhmisch-Pfälzischer Krieg
|
58
|
+
0723.001:Mu.hammad; Mohammed
|
59
|
+
1105.001:Einbl. V,8 b-11
|
60
|
+
1125.001:Blattmaß 32,5 x 26 cm; Hinterklebt
|
61
|
+
1145.001:ei
|
62
|
+
2005:Metallschnittleisten als Spaltentrenner; Textinitiale
|
63
|
+
2006:Paas, John Roger: The German Political Broadsheet (1600 - 1700)
|
64
|
+
2008:Mohammed; Zank; Hader; Katze
|
65
|
+
2018:Versform; 4 Spalten
|
66
|
+
2023:¤ 34B12 - Katze ¤ 25F(+51) - kämpfende Tiere; aggressive Beziehungen
|
67
|
+
2029:002
|
68
|
+
2030.001:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000184_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
|
69
|
+
9999:
|
70
|
+
0000:5
|
71
|
+
0001:000000005
|
72
|
+
0002:03.05.2001
|
73
|
+
0003:29.08.2002
|
74
|
+
0015:ger
|
75
|
+
0036:m
|
76
|
+
0331:Alles hatt Sein Zeydt
|
77
|
+
0331.001:-- Hää?
|
78
|
+
0424:1621
|
79
|
+
0425:[1621]
|
80
|
+
0433:1 Bl.
|
81
|
+
0434:1 Ill.; 16 x 27,5 cm
|
82
|
+
0435:Satspiegel 16 x 27,5 cm
|
83
|
+
0440.001:[S.l.]
|
84
|
+
0511:Erscheinungsjahr ermittelt aus Bildinhalt und Nachweis
|
85
|
+
0511:Erscheinungsjahr ermittelt aus Bildinhalt und Nachweis
|
86
|
+
0527.001:Andere Ausgabe: Einbl. V,8 b-8; Deutsche illustrierte Flugblätter des 16. und 17. Jahrhunderts
|
87
|
+
2028:BLUB
|
88
|
+
2029:003
|
89
|
+
2030.001:http://zoom.bib-bvb.de/StyleServer/calcrgn?cat=einbl&item=/300000185_0_r.sid&wid=750&hei=500&style=bsb/einbl.xsl&plugin=false
|
90
|
+
9999:
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of athena, the database file converter. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# athena is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# athena is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with athena. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'iconv'
|
30
|
+
|
31
|
+
class Athena::Formats

  # Output format registered under the names 'dbm' and 'midos'.
  class DBM < Athena::Formats

    register_formats :out, 'dbm', 'midos'

    # Line terminator used throughout the output (CR LF).
    CRLF = "\015\012"

    # Converter applied to every field value (UTF-8 in, Latin-1 out).
    ICONV_TO_LATIN1 = Iconv.new('latin1//TRANSLIT//IGNORE', 'utf-8')

    VALUE_SEPARATOR  = '|'
    RECORD_SEPARATOR = '&&&'

    # Renders +record+ as a CRLF-separated text block:
    #
    #   ID:<record.id>
    #   <FIELD>:<formatted value>     (one line per entry of record.struct)
    #   &&&
    #   <empty line>
    #
    # For each field, the values of every configured element are stripped,
    # have their line breaks collapsed to a single space, and are joined with
    # VALUE_SEPARATOR; an element without any non-empty value contributes
    # struct[:empty] instead. The per-element strings are then interpolated
    # into struct[:string] and converted with ICONV_TO_LATIN1.
    #
    # Returns the rendered String.
    def self.convert(record)
      lines = ["ID:#{record.id}"]

      record.struct.each { |field, struct|
        per_element = struct[:elements].map { |element|
          raw = struct[:values][element] || []

          cleaned = raw.map { |value|
            (value || '').strip.gsub(/(?:\r?\n)+/, ' ')
          }.reject { |value| value.empty? }

          cleaned.empty? ? struct[:empty] : cleaned.join(VALUE_SEPARATOR)
        }

        formatted = ICONV_TO_LATIN1.iconv(struct[:string] % per_element)
        lines << "#{field.to_s.upcase}:#{formatted}"
      }

      # The two trailing empty entries yield the closing CRLF CRLF.
      (lines + [RECORD_SEPARATOR, '', '']).join(CRLF)
    end

  end

end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of athena, the database file converter. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# athena is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# athena is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with athena. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'rubygems'
|
30
|
+
|
31
|
+
gem 'ferret', ENV['FERRET_VERSION'] if ENV['FERRET_VERSION']
|
32
|
+
require 'ferret'
|
33
|
+
|
34
|
+
class Athena::Formats

  # Input format registered under the name 'ferret'. Reads the documents of
  # a Ferret index directory and builds one Athena::Record per document.
  class Ferret < Athena::Formats

    register_format :in, 'ferret'

    attr_reader :record_element, :config, :parser, :match_all_query

    # Raised when the config has no :__record_element entry.
    class NoRecordElementError < StandardError
    end

    # Raised when the :__record_element entry is not a String.
    class IllegalRecordElementError < StandardError
    end

    # parser:: object providing +config+ (a Hash) and +block+.
    #
    # Works on a copy of the parser's config; the :__record_element entry is
    # removed from that copy and kept separately as +record_element+.
    #
    # Raises NoRecordElementError or IllegalRecordElementError if the record
    # element is missing or not a String.
    def initialize(parser)
      config = parser.config.dup

      case @record_element = config.delete(:__record_element)
        when String
          # fine!
        when nil
          raise NoRecordElementError, 'no record element specified'
        else
          raise IllegalRecordElementError, "illegal record element #{@record_element}"
      end

      @config = config
      @parser = parser
    end

    # source:: object responding to +path+, naming the index directory.
    #
    # Creates an Athena::Record for every non-deleted document in the index,
    # feeding it each configured element's value from that document.
    #
    # Returns the index's +num_docs+. Raises RuntimeError if the index cannot
    # be opened or read.
    def parse(source)
      path = source.path

      # make sure the index can be opened
      begin
        File.open(File.join(path, 'segments')) {}
      rescue Errno::ENOENT, Errno::EACCES => err
        raise "can't open index at #{path} (#{err.to_s.sub(/ - .*/, '')})"
      end

      index = ::Ferret::Index::IndexReader.new(path)
      first, last = 0, index.max_doc - 1

      # make sure we can read from the index
      begin
        index[first]
        index[last]
      rescue StandardError  # EOFError, "Not available", ...
        # The leading '::' is required: a bare +Ferret+ here would resolve to
        # this class (Athena::Formats::Ferret), not to the Ferret library.
        raise "possible Ferret version mismatch; try to set the " <<
              "FERRET_VERSION environment variable to something " <<
              "other than #{::Ferret::VERSION}"
      end

      first.upto(last) { |i|
        unless index.deleted?(i)
          doc = index[i]

          Athena::Record.new(parser.block, doc[record_element]) { |record|
            config.each { |element, field_config|
              record.update(element, doc[element], field_config)
            }
          }
        end
      }

      index.num_docs
    end

  end

end
|