athena 0.0.2.56 → 0.0.3.58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +2 -2
- data/Rakefile +17 -16
- data/bin/athena +5 -5
- data/lib/athena.rb +6 -3
- data/lib/athena/formats.rb +40 -44
- data/lib/athena/formats/dbm.rb +22 -26
- data/lib/athena/formats/ferret.rb +91 -0
- data/lib/athena/formats/lingo.rb +69 -73
- data/lib/athena/formats/sisis.rb +36 -40
- data/lib/athena/formats/xml.rb +159 -163
- data/lib/athena/parser.rb +49 -53
- data/lib/athena/record.rb +50 -54
- data/lib/athena/util.rb +14 -18
- data/lib/athena/version.rb +16 -20
- metadata +8 -7
data/README
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
== VERSION
|
4
4
|
|
5
|
-
This documentation refers to athena version 0.0.
|
5
|
+
This documentation refers to athena version 0.0.3
|
6
6
|
|
7
7
|
|
8
8
|
== DESCRIPTION
|
@@ -17,7 +17,7 @@ TODO: well, the description... ;-)
|
|
17
17
|
|
18
18
|
== LICENSE AND COPYRIGHT
|
19
19
|
|
20
|
-
Copyright (C) 2007 University of Cologne,
|
20
|
+
Copyright (C) 2007-2008 University of Cologne,
|
21
21
|
Albertus-Magnus-Platz, 50932 Cologne, Germany
|
22
22
|
|
23
23
|
athena is free software: you can redistribute it and/or modify it under the
|
data/Rakefile
CHANGED
@@ -1,21 +1,22 @@
|
|
1
|
+
$:.unshift('lib')
|
2
|
+
require 'athena'
|
3
|
+
|
1
4
|
begin
|
2
5
|
require 'hen'
|
6
|
+
|
7
|
+
Hen.lay! {{
|
8
|
+
:rubyforge => {
|
9
|
+
:package => 'athena'
|
10
|
+
},
|
11
|
+
|
12
|
+
:gem => {
|
13
|
+
:version => Athena::VERSION,
|
14
|
+
:summary => 'Convert database files to various formats.',
|
15
|
+
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
16
|
+
:extra_files => FileList['[A-Z]*', 'example/*'].to_a,
|
17
|
+
:dependencies => %w[xmlstreamin ruby-nuggets]
|
18
|
+
}
|
19
|
+
}}
|
3
20
|
rescue LoadError
|
4
21
|
abort "Please install the 'hen' gem first."
|
5
22
|
end
|
6
|
-
|
7
|
-
require 'lib/athena/version'
|
8
|
-
|
9
|
-
Hen.lay! {{
|
10
|
-
:rubyforge => {
|
11
|
-
:package => 'athena'
|
12
|
-
},
|
13
|
-
|
14
|
-
:gem => {
|
15
|
-
:version => Athena::VERSION,
|
16
|
-
:summary => 'Convert database files to various formats.',
|
17
|
-
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
18
|
-
:extra_files => FileList['[A-Z]*', 'example/*'].to_a,
|
19
|
-
:dependencies => %w[xmlstreamin ruby-nuggets]
|
20
|
-
}
|
21
|
-
}}
|
data/bin/athena
CHANGED
@@ -5,9 +5,9 @@
|
|
5
5
|
# #
|
6
6
|
# athena -- Convert database files to various formats #
|
7
7
|
# #
|
8
|
-
# Copyright (C) 2007 University of Cologne,
|
9
|
-
#
|
10
|
-
#
|
8
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
9
|
+
# Albertus-Magnus-Platz, #
|
10
|
+
# 50932 Cologne, Germany #
|
11
11
|
# #
|
12
12
|
# Authors: #
|
13
13
|
# Jens Wille <jens.wille@uni-koeln.de> #
|
@@ -65,11 +65,11 @@ OptionParser.new { |opts|
|
|
65
65
|
opts.on('-i', '--input FILE', "Input file [Default: STDIN]") { |f|
|
66
66
|
abort "Can't find input file: #{f}." unless File.readable?(f)
|
67
67
|
|
68
|
-
options[:input] = File.open(f, 'r')
|
68
|
+
options[:input] = File.directory?(f) ? Dir.open(f) : File.open(f, 'r')
|
69
69
|
|
70
70
|
p = File.basename(f).split('.')
|
71
71
|
options[:spec_fallback] = p.last.downcase
|
72
|
-
options[:target_fallback] = p[0..-2].join('.')
|
72
|
+
options[:target_fallback] = p.size > 1 ? p[0..-2].join('.') : p.first
|
73
73
|
}
|
74
74
|
|
75
75
|
opts.on('-s', '--spec SPEC', "Input format (spec) [Default: file ending of <input-file>]") { |s|
|
data/lib/athena.rb
CHANGED
@@ -3,9 +3,9 @@
|
|
3
3
|
# #
|
4
4
|
# athena -- Convert database files to various formats #
|
5
5
|
# #
|
6
|
-
# Copyright (C) 2007 University of Cologne,
|
7
|
-
#
|
8
|
-
#
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
9
|
# #
|
10
10
|
# Authors: #
|
11
11
|
# Jens Wille <jens.wille@uni-koeln.de> #
|
@@ -35,6 +35,9 @@
|
|
35
35
|
# class method _convert_ supplied. This way, a specific format can even function
|
36
36
|
# as both input and output format.
|
37
37
|
|
38
|
+
module Athena
|
39
|
+
end
|
40
|
+
|
38
41
|
require 'athena/util'
|
39
42
|
require 'athena/parser'
|
40
43
|
require 'athena/record'
|
data/lib/athena/formats.rb
CHANGED
@@ -3,9 +3,9 @@
|
|
3
3
|
# #
|
4
4
|
# A component of athena, the database file converter. #
|
5
5
|
# #
|
6
|
-
# Copyright (C) 2007 University of Cologne,
|
7
|
-
#
|
8
|
-
#
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
9
|
# #
|
10
10
|
# Authors: #
|
11
11
|
# Jens Wille <jens.wille@uni-koeln.de> #
|
@@ -26,65 +26,61 @@
|
|
26
26
|
###############################################################################
|
27
27
|
#++
|
28
28
|
|
29
|
-
|
29
|
+
class Athena::Formats
|
30
30
|
|
31
|
-
|
31
|
+
@formats = { :in => {}, :out => {} }
|
32
32
|
|
33
|
-
|
33
|
+
class << self
|
34
34
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
Formats.instance_variable_get(:@formats)
|
39
|
-
end
|
40
|
-
|
41
|
-
def [](direction, format)
|
42
|
-
formats[direction][format]
|
43
|
-
end
|
44
|
-
|
45
|
-
def valid_format?(direction, format)
|
46
|
-
formats[direction].has_key?(format)
|
47
|
-
end
|
35
|
+
def formats
|
36
|
+
Athena::Formats.instance_variable_get(:@formats)
|
37
|
+
end
|
48
38
|
|
49
|
-
|
50
|
-
|
51
|
-
|
39
|
+
def [](direction, format)
|
40
|
+
formats[direction][format]
|
41
|
+
end
|
52
42
|
|
53
|
-
|
54
|
-
|
55
|
-
|
43
|
+
def valid_format?(direction, format)
|
44
|
+
formats[direction].has_key?(format)
|
45
|
+
end
|
56
46
|
|
57
|
-
|
47
|
+
def deferred?
|
48
|
+
false
|
49
|
+
end
|
58
50
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
"format already defined (#{direction}): #{format} = #{existing}"
|
63
|
-
end
|
51
|
+
def convert(*args)
|
52
|
+
raise NotImplementedError, 'must be defined by sub-class'
|
53
|
+
end
|
64
54
|
|
65
|
-
|
66
|
-
end
|
55
|
+
private
|
67
56
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
57
|
+
def register_format(direction, format)
|
58
|
+
if existing = formats[direction][format]
|
59
|
+
raise DuplicateFormatDefinitionError,
|
60
|
+
"format already defined (#{direction}): #{format} = #{existing}"
|
72
61
|
end
|
73
62
|
|
63
|
+
formats[direction][format] = self
|
74
64
|
end
|
75
65
|
|
76
|
-
def
|
77
|
-
|
66
|
+
def register_formats(direction, *formats)
|
67
|
+
formats.each { |format|
|
68
|
+
register_format(direction, format)
|
69
|
+
}
|
78
70
|
end
|
79
71
|
|
80
|
-
|
81
|
-
end
|
72
|
+
end
|
82
73
|
|
83
|
-
|
84
|
-
|
74
|
+
def parse(*args)
|
75
|
+
raise NotImplementedError, 'must be defined by sub-class'
|
76
|
+
end
|
85
77
|
|
78
|
+
class DuplicateFormatDefinitionError < StandardError
|
86
79
|
end
|
87
|
-
|
80
|
+
|
81
|
+
class FormatArgumentError < ArgumentError
|
82
|
+
end
|
83
|
+
|
88
84
|
end
|
89
85
|
|
90
86
|
Dir[__FILE__.sub(/\.rb$/, '/**/*.rb')].each { |rb|
|
data/lib/athena/formats/dbm.rb
CHANGED
@@ -3,9 +3,9 @@
|
|
3
3
|
# #
|
4
4
|
# A component of athena, the database file converter. #
|
5
5
|
# #
|
6
|
-
# Copyright (C) 2007 University of Cologne,
|
7
|
-
#
|
8
|
-
#
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
9
|
# #
|
10
10
|
# Authors: #
|
11
11
|
# Jens Wille <jens.wille@uni-koeln.de> #
|
@@ -28,41 +28,37 @@
|
|
28
28
|
|
29
29
|
require 'iconv'
|
30
30
|
|
31
|
-
|
31
|
+
class Athena::Formats
|
32
32
|
|
33
|
-
class Formats
|
33
|
+
class DBM < Athena::Formats
|
34
34
|
|
35
|
-
|
35
|
+
register_formats :out, 'dbm', 'midos'
|
36
36
|
|
37
|
-
|
37
|
+
CRLF = "\015\012"
|
38
38
|
|
39
|
-
|
39
|
+
ICONV_TO_LATIN1 = Iconv.new('latin1', 'utf-8')
|
40
40
|
|
41
|
-
|
41
|
+
VALUE_SEPARATOR = '|'
|
42
|
+
RECORD_SEPARATOR = '&&&'
|
42
43
|
|
43
|
-
|
44
|
-
|
44
|
+
def self.convert(record)
|
45
|
+
dbm = ["ID:#{record.id}"]
|
45
46
|
|
46
|
-
|
47
|
-
|
47
|
+
record.struct.each { |field, struct|
|
48
|
+
strings = struct[:elements].inject([]) { |array, element|
|
49
|
+
values = (struct[:values][element] || []).map { |v|
|
50
|
+
(v || '').strip.gsub(/(?:\r?\n)+/, ' ')
|
51
|
+
}.reject { |v| v.empty? }
|
48
52
|
|
49
|
-
|
50
|
-
strings = struct[:elements].inject([]) { |array, element|
|
51
|
-
values = (struct[:values][element] || []).map { |v|
|
52
|
-
(v || '').strip.gsub(/(?:\r?\n)+/, ' ')
|
53
|
-
}.reject { |v| v.empty? }
|
54
|
-
|
55
|
-
array << (values.empty? ? struct[:empty] : values.join(VALUE_SEPARATOR))
|
56
|
-
}
|
57
|
-
|
58
|
-
dbm << "#{field.to_s.upcase}:#{ICONV_TO_LATIN1.iconv(struct[:string] % strings)}"
|
53
|
+
array << (values.empty? ? struct[:empty] : values.join(VALUE_SEPARATOR))
|
59
54
|
}
|
60
55
|
|
61
|
-
dbm <<
|
56
|
+
dbm << "#{field.to_s.upcase}:#{ICONV_TO_LATIN1.iconv(struct[:string] % strings)}"
|
57
|
+
}
|
62
58
|
|
63
|
-
|
64
|
-
end
|
59
|
+
dbm << RECORD_SEPARATOR
|
65
60
|
|
61
|
+
dbm.join(CRLF) << CRLF << CRLF
|
66
62
|
end
|
67
63
|
|
68
64
|
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of athena, the database file converter. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# athena is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# athena is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with athena. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'rubygems'
|
30
|
+
require 'ferret'
|
31
|
+
|
32
|
+
class Athena::Formats
|
33
|
+
|
34
|
+
class Ferret < Athena::Formats
|
35
|
+
|
36
|
+
register_format :in, 'ferret'
|
37
|
+
|
38
|
+
attr_reader :record_element, :config, :parser, :match_all_query
|
39
|
+
|
40
|
+
def initialize(parser)
|
41
|
+
config = parser.config.dup
|
42
|
+
|
43
|
+
case @record_element = config.delete(:__record_element)
|
44
|
+
when String
|
45
|
+
# fine!
|
46
|
+
when nil
|
47
|
+
raise NoRecordElementError, 'no record element specified'
|
48
|
+
else
|
49
|
+
raise IllegalRecordElementError, "illegal record element #{@record_element}"
|
50
|
+
end
|
51
|
+
|
52
|
+
@config = config
|
53
|
+
@parser = parser
|
54
|
+
|
55
|
+
@match_all_query = ::Ferret::Search::MatchAllQuery.new
|
56
|
+
end
|
57
|
+
|
58
|
+
def parse(source)
|
59
|
+
search_all(source) { |doc|
|
60
|
+
record = Athena::Record.new(parser.block, doc[record_element])
|
61
|
+
|
62
|
+
config.each { |element, field_config|
|
63
|
+
record.update(element, doc[element], field_config)
|
64
|
+
}
|
65
|
+
|
66
|
+
record.close
|
67
|
+
}
|
68
|
+
end
|
69
|
+
|
70
|
+
private
|
71
|
+
|
72
|
+
def search_all(source)
|
73
|
+
index = ::Ferret::Index::Index.new(
|
74
|
+
:path => source.path,
|
75
|
+
:create_if_missing => false
|
76
|
+
).searcher
|
77
|
+
|
78
|
+
index.search_each(match_all_query, :limit => :all) { |doc_id, _|
|
79
|
+
yield index[doc_id] if block_given?
|
80
|
+
}
|
81
|
+
end
|
82
|
+
|
83
|
+
class NoRecordElementError < StandardError
|
84
|
+
end
|
85
|
+
|
86
|
+
class IllegalRecordElementError < StandardError
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
90
|
+
|
91
|
+
end
|
data/lib/athena/formats/lingo.rb
CHANGED
@@ -3,9 +3,9 @@
|
|
3
3
|
# #
|
4
4
|
# A component of athena, the database file converter. #
|
5
5
|
# #
|
6
|
-
# Copyright (C) 2007 University of Cologne,
|
7
|
-
#
|
8
|
-
#
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
9
|
# #
|
10
10
|
# Authors: #
|
11
11
|
# Jens Wille <jens.wille@uni-koeln.de> #
|
@@ -29,113 +29,109 @@
|
|
29
29
|
require 'iconv'
|
30
30
|
require 'enumerator'
|
31
31
|
|
32
|
-
|
32
|
+
class Athena::Formats
|
33
33
|
|
34
|
-
|
34
|
+
module Lingo
|
35
35
|
|
36
|
-
|
36
|
+
class Base < Athena::Formats
|
37
37
|
|
38
|
-
class
|
38
|
+
class << self
|
39
39
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
(v || '').strip.gsub(/(?:\r?\n)+/, ' ')
|
47
|
-
}.reject { |v| v.empty? }
|
48
|
-
}
|
40
|
+
def convert(record)
|
41
|
+
record.struct.inject([]) { |terms, (field, struct)|
|
42
|
+
terms << struct[:elements].inject([]) { |array, element|
|
43
|
+
array += (struct[:values][element] || []).map { |v|
|
44
|
+
(v || '').strip.gsub(/(?:\r?\n)+/, ' ')
|
45
|
+
}.reject { |v| v.empty? }
|
49
46
|
}
|
50
|
-
|
51
|
-
|
52
|
-
def deferred?
|
53
|
-
true
|
54
|
-
end
|
47
|
+
}
|
48
|
+
end
|
55
49
|
|
56
|
-
|
50
|
+
def deferred?
|
51
|
+
true
|
52
|
+
end
|
57
53
|
|
58
|
-
|
59
|
-
return true if block ? block[actual] : expected == actual
|
54
|
+
private
|
60
55
|
|
61
|
-
|
56
|
+
def check_number_of_arguments(expected, actual, blow = false, &block)
|
57
|
+
return true if block ? block[actual] : expected == actual
|
62
58
|
|
63
|
-
|
64
|
-
raise FormatArgumentError, msg
|
65
|
-
else
|
66
|
-
warn msg
|
67
|
-
return false
|
68
|
-
end
|
69
|
-
end
|
59
|
+
msg = "wrong number of arguments for #{self} (#{actual} for #{expected})"
|
70
60
|
|
71
|
-
|
72
|
-
|
61
|
+
if blow
|
62
|
+
raise FormatArgumentError, msg
|
63
|
+
else
|
64
|
+
warn msg
|
65
|
+
return false
|
73
66
|
end
|
67
|
+
end
|
74
68
|
|
69
|
+
def check_number_of_arguments!(expected, actual, &block)
|
70
|
+
check_number_of_arguments(expected, actual, true, &block)
|
75
71
|
end
|
76
72
|
|
77
73
|
end
|
78
74
|
|
79
|
-
|
80
|
-
class SingleWord < Athena::Formats::Lingo::Base
|
75
|
+
end
|
81
76
|
|
82
|
-
|
77
|
+
# "Nasenbär\n"
|
78
|
+
class SingleWord < Athena::Formats::Lingo::Base
|
83
79
|
|
84
|
-
|
85
|
-
super.flatten
|
86
|
-
end
|
80
|
+
register_formats :out, 'lingo/single_word'
|
87
81
|
|
82
|
+
def self.convert(record)
|
83
|
+
super.flatten
|
88
84
|
end
|
89
85
|
|
90
|
-
|
91
|
-
class KeyValue < Athena::Formats::Lingo::Base
|
86
|
+
end
|
92
87
|
|
93
|
-
|
88
|
+
# "John Vorhauer*Vorhauer, John\n"
|
89
|
+
class KeyValue < Athena::Formats::Lingo::Base
|
94
90
|
|
95
|
-
|
96
|
-
super.map { |terms|
|
97
|
-
next unless check_number_of_arguments(2, terms.size)
|
91
|
+
register_formats :out, 'lingo/key_value'
|
98
92
|
|
99
|
-
|
100
|
-
|
101
|
-
|
93
|
+
def self.convert(record)
|
94
|
+
super.map { |terms|
|
95
|
+
next unless check_number_of_arguments(2, terms.size)
|
102
96
|
|
97
|
+
terms.join('*')
|
98
|
+
}.compact
|
103
99
|
end
|
104
100
|
|
105
|
-
|
106
|
-
class WordClass < Athena::Formats::Lingo::Base
|
101
|
+
end
|
107
102
|
|
108
|
-
|
103
|
+
# "Essen,essen #v Essen #s Esse #s\n"
|
104
|
+
class WordClass < Athena::Formats::Lingo::Base
|
109
105
|
|
110
|
-
|
111
|
-
super.map { |terms|
|
112
|
-
next unless check_number_of_arguments('odd, > 1', terms.size) { |actual|
|
113
|
-
actual > 1 && actual % 2 == 1
|
114
|
-
}
|
106
|
+
register_formats :out, 'lingo/word_class'
|
115
107
|
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
108
|
+
def self.convert(record)
|
109
|
+
super.map { |terms|
|
110
|
+
next unless check_number_of_arguments('odd, > 1', terms.size) { |actual|
|
111
|
+
actual > 1 && actual % 2 == 1
|
112
|
+
}
|
121
113
|
|
114
|
+
[terms.shift, terms.to_enum(:each_slice, 2).map { |form, wc|
|
115
|
+
"#{form} ##{wc}"
|
116
|
+
}.join(' ')].join(',')
|
117
|
+
}.compact
|
122
118
|
end
|
123
119
|
|
124
|
-
|
125
|
-
class MultiValue < Athena::Formats::Lingo::Base
|
120
|
+
end
|
126
121
|
|
127
|
-
|
122
|
+
# "Fax;Faxkopie;Telefax\n"
|
123
|
+
class MultiValue < Athena::Formats::Lingo::Base
|
128
124
|
|
129
|
-
|
130
|
-
super.map { |terms|
|
131
|
-
next unless check_number_of_arguments('> 1', terms.size) { |actual|
|
132
|
-
actual > 1
|
133
|
-
}
|
125
|
+
register_formats :out, 'lingo/multi_value', 'lingo/multi_key'
|
134
126
|
|
135
|
-
|
136
|
-
|
137
|
-
|
127
|
+
def self.convert(record)
|
128
|
+
super.map { |terms|
|
129
|
+
next unless check_number_of_arguments('> 1', terms.size) { |actual|
|
130
|
+
actual > 1
|
131
|
+
}
|
138
132
|
|
133
|
+
terms.join(';')
|
134
|
+
}.compact
|
139
135
|
end
|
140
136
|
|
141
137
|
end
|