athena 0.0.2.56 → 0.0.3.58
Sign up to get free protection for your applications and to get access to all the features.
- data/README +2 -2
- data/Rakefile +17 -16
- data/bin/athena +5 -5
- data/lib/athena.rb +6 -3
- data/lib/athena/formats.rb +40 -44
- data/lib/athena/formats/dbm.rb +22 -26
- data/lib/athena/formats/ferret.rb +91 -0
- data/lib/athena/formats/lingo.rb +69 -73
- data/lib/athena/formats/sisis.rb +36 -40
- data/lib/athena/formats/xml.rb +159 -163
- data/lib/athena/parser.rb +49 -53
- data/lib/athena/record.rb +50 -54
- data/lib/athena/util.rb +14 -18
- data/lib/athena/version.rb +16 -20
- metadata +8 -7
data/README
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
== VERSION
|
4
4
|
|
5
|
-
This documentation refers to athena version 0.0.
|
5
|
+
This documentation refers to athena version 0.0.3
|
6
6
|
|
7
7
|
|
8
8
|
== DESCRIPTION
|
@@ -17,7 +17,7 @@ TODO: well, the description... ;-)
|
|
17
17
|
|
18
18
|
== LICENSE AND COPYRIGHT
|
19
19
|
|
20
|
-
Copyright (C) 2007 University of Cologne,
|
20
|
+
Copyright (C) 2007-2008 University of Cologne,
|
21
21
|
Albertus-Magnus-Platz, 50932 Cologne, Germany
|
22
22
|
|
23
23
|
athena is free software: you can redistribute it and/or modify it under the
|
data/Rakefile
CHANGED
@@ -1,21 +1,22 @@
|
|
1
|
+
$:.unshift('lib')
|
2
|
+
require 'athena'
|
3
|
+
|
1
4
|
begin
|
2
5
|
require 'hen'
|
6
|
+
|
7
|
+
Hen.lay! {{
|
8
|
+
:rubyforge => {
|
9
|
+
:package => 'athena'
|
10
|
+
},
|
11
|
+
|
12
|
+
:gem => {
|
13
|
+
:version => Athena::VERSION,
|
14
|
+
:summary => 'Convert database files to various formats.',
|
15
|
+
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
16
|
+
:extra_files => FileList['[A-Z]*', 'example/*'].to_a,
|
17
|
+
:dependencies => %w[xmlstreamin ruby-nuggets]
|
18
|
+
}
|
19
|
+
}}
|
3
20
|
rescue LoadError
|
4
21
|
abort "Please install the 'hen' gem first."
|
5
22
|
end
|
6
|
-
|
7
|
-
require 'lib/athena/version'
|
8
|
-
|
9
|
-
Hen.lay! {{
|
10
|
-
:rubyforge => {
|
11
|
-
:package => 'athena'
|
12
|
-
},
|
13
|
-
|
14
|
-
:gem => {
|
15
|
-
:version => Athena::VERSION,
|
16
|
-
:summary => 'Convert database files to various formats.',
|
17
|
-
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
18
|
-
:extra_files => FileList['[A-Z]*', 'example/*'].to_a,
|
19
|
-
:dependencies => %w[xmlstreamin ruby-nuggets]
|
20
|
-
}
|
21
|
-
}}
|
data/bin/athena
CHANGED
@@ -5,9 +5,9 @@
|
|
5
5
|
# #
|
6
6
|
# athena -- Convert database files to various formats #
|
7
7
|
# #
|
8
|
-
# Copyright (C) 2007 University of Cologne,
|
9
|
-
#
|
10
|
-
#
|
8
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
9
|
+
# Albertus-Magnus-Platz, #
|
10
|
+
# 50932 Cologne, Germany #
|
11
11
|
# #
|
12
12
|
# Authors: #
|
13
13
|
# Jens Wille <jens.wille@uni-koeln.de> #
|
@@ -65,11 +65,11 @@ OptionParser.new { |opts|
|
|
65
65
|
opts.on('-i', '--input FILE', "Input file [Default: STDIN]") { |f|
|
66
66
|
abort "Can't find input file: #{f}." unless File.readable?(f)
|
67
67
|
|
68
|
-
options[:input] = File.open(f, 'r')
|
68
|
+
options[:input] = File.directory?(f) ? Dir.open(f) : File.open(f, 'r')
|
69
69
|
|
70
70
|
p = File.basename(f).split('.')
|
71
71
|
options[:spec_fallback] = p.last.downcase
|
72
|
-
options[:target_fallback] = p[0..-2].join('.')
|
72
|
+
options[:target_fallback] = p.size > 1 ? p[0..-2].join('.') : p.first
|
73
73
|
}
|
74
74
|
|
75
75
|
opts.on('-s', '--spec SPEC', "Input format (spec) [Default: file ending of <input-file>]") { |s|
|
data/lib/athena.rb
CHANGED
@@ -3,9 +3,9 @@
|
|
3
3
|
# #
|
4
4
|
# athena -- Convert database files to various formats #
|
5
5
|
# #
|
6
|
-
# Copyright (C) 2007 University of Cologne,
|
7
|
-
#
|
8
|
-
#
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
9
|
# #
|
10
10
|
# Authors: #
|
11
11
|
# Jens Wille <jens.wille@uni-koeln.de> #
|
@@ -35,6 +35,9 @@
|
|
35
35
|
# class method _convert_ supplied. This way, a specific format can even function
|
36
36
|
# as both input and output format.
|
37
37
|
|
38
|
+
module Athena
|
39
|
+
end
|
40
|
+
|
38
41
|
require 'athena/util'
|
39
42
|
require 'athena/parser'
|
40
43
|
require 'athena/record'
|
data/lib/athena/formats.rb
CHANGED
@@ -3,9 +3,9 @@
|
|
3
3
|
# #
|
4
4
|
# A component of athena, the database file converter. #
|
5
5
|
# #
|
6
|
-
# Copyright (C) 2007 University of Cologne,
|
7
|
-
#
|
8
|
-
#
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
9
|
# #
|
10
10
|
# Authors: #
|
11
11
|
# Jens Wille <jens.wille@uni-koeln.de> #
|
@@ -26,65 +26,61 @@
|
|
26
26
|
###############################################################################
|
27
27
|
#++
|
28
28
|
|
29
|
-
|
29
|
+
class Athena::Formats
|
30
30
|
|
31
|
-
|
31
|
+
@formats = { :in => {}, :out => {} }
|
32
32
|
|
33
|
-
|
33
|
+
class << self
|
34
34
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
Formats.instance_variable_get(:@formats)
|
39
|
-
end
|
40
|
-
|
41
|
-
def [](direction, format)
|
42
|
-
formats[direction][format]
|
43
|
-
end
|
44
|
-
|
45
|
-
def valid_format?(direction, format)
|
46
|
-
formats[direction].has_key?(format)
|
47
|
-
end
|
35
|
+
def formats
|
36
|
+
Athena::Formats.instance_variable_get(:@formats)
|
37
|
+
end
|
48
38
|
|
49
|
-
|
50
|
-
|
51
|
-
|
39
|
+
def [](direction, format)
|
40
|
+
formats[direction][format]
|
41
|
+
end
|
52
42
|
|
53
|
-
|
54
|
-
|
55
|
-
|
43
|
+
def valid_format?(direction, format)
|
44
|
+
formats[direction].has_key?(format)
|
45
|
+
end
|
56
46
|
|
57
|
-
|
47
|
+
def deferred?
|
48
|
+
false
|
49
|
+
end
|
58
50
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
"format already defined (#{direction}): #{format} = #{existing}"
|
63
|
-
end
|
51
|
+
def convert(*args)
|
52
|
+
raise NotImplementedError, 'must be defined by sub-class'
|
53
|
+
end
|
64
54
|
|
65
|
-
|
66
|
-
end
|
55
|
+
private
|
67
56
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
57
|
+
def register_format(direction, format)
|
58
|
+
if existing = formats[direction][format]
|
59
|
+
raise DuplicateFormatDefinitionError,
|
60
|
+
"format already defined (#{direction}): #{format} = #{existing}"
|
72
61
|
end
|
73
62
|
|
63
|
+
formats[direction][format] = self
|
74
64
|
end
|
75
65
|
|
76
|
-
def
|
77
|
-
|
66
|
+
def register_formats(direction, *formats)
|
67
|
+
formats.each { |format|
|
68
|
+
register_format(direction, format)
|
69
|
+
}
|
78
70
|
end
|
79
71
|
|
80
|
-
|
81
|
-
end
|
72
|
+
end
|
82
73
|
|
83
|
-
|
84
|
-
|
74
|
+
def parse(*args)
|
75
|
+
raise NotImplementedError, 'must be defined by sub-class'
|
76
|
+
end
|
85
77
|
|
78
|
+
class DuplicateFormatDefinitionError < StandardError
|
86
79
|
end
|
87
|
-
|
80
|
+
|
81
|
+
class FormatArgumentError < ArgumentError
|
82
|
+
end
|
83
|
+
|
88
84
|
end
|
89
85
|
|
90
86
|
Dir[__FILE__.sub(/\.rb$/, '/**/*.rb')].each { |rb|
|
data/lib/athena/formats/dbm.rb
CHANGED
@@ -3,9 +3,9 @@
|
|
3
3
|
# #
|
4
4
|
# A component of athena, the database file converter. #
|
5
5
|
# #
|
6
|
-
# Copyright (C) 2007 University of Cologne,
|
7
|
-
#
|
8
|
-
#
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
9
|
# #
|
10
10
|
# Authors: #
|
11
11
|
# Jens Wille <jens.wille@uni-koeln.de> #
|
@@ -28,41 +28,37 @@
|
|
28
28
|
|
29
29
|
require 'iconv'
|
30
30
|
|
31
|
-
|
31
|
+
class Athena::Formats
|
32
32
|
|
33
|
-
class Formats
|
33
|
+
class DBM < Athena::Formats
|
34
34
|
|
35
|
-
|
35
|
+
register_formats :out, 'dbm', 'midos'
|
36
36
|
|
37
|
-
|
37
|
+
CRLF = "\015\012"
|
38
38
|
|
39
|
-
|
39
|
+
ICONV_TO_LATIN1 = Iconv.new('latin1', 'utf-8')
|
40
40
|
|
41
|
-
|
41
|
+
VALUE_SEPARATOR = '|'
|
42
|
+
RECORD_SEPARATOR = '&&&'
|
42
43
|
|
43
|
-
|
44
|
-
|
44
|
+
def self.convert(record)
|
45
|
+
dbm = ["ID:#{record.id}"]
|
45
46
|
|
46
|
-
|
47
|
-
|
47
|
+
record.struct.each { |field, struct|
|
48
|
+
strings = struct[:elements].inject([]) { |array, element|
|
49
|
+
values = (struct[:values][element] || []).map { |v|
|
50
|
+
(v || '').strip.gsub(/(?:\r?\n)+/, ' ')
|
51
|
+
}.reject { |v| v.empty? }
|
48
52
|
|
49
|
-
|
50
|
-
strings = struct[:elements].inject([]) { |array, element|
|
51
|
-
values = (struct[:values][element] || []).map { |v|
|
52
|
-
(v || '').strip.gsub(/(?:\r?\n)+/, ' ')
|
53
|
-
}.reject { |v| v.empty? }
|
54
|
-
|
55
|
-
array << (values.empty? ? struct[:empty] : values.join(VALUE_SEPARATOR))
|
56
|
-
}
|
57
|
-
|
58
|
-
dbm << "#{field.to_s.upcase}:#{ICONV_TO_LATIN1.iconv(struct[:string] % strings)}"
|
53
|
+
array << (values.empty? ? struct[:empty] : values.join(VALUE_SEPARATOR))
|
59
54
|
}
|
60
55
|
|
61
|
-
dbm <<
|
56
|
+
dbm << "#{field.to_s.upcase}:#{ICONV_TO_LATIN1.iconv(struct[:string] % strings)}"
|
57
|
+
}
|
62
58
|
|
63
|
-
|
64
|
-
end
|
59
|
+
dbm << RECORD_SEPARATOR
|
65
60
|
|
61
|
+
dbm.join(CRLF) << CRLF << CRLF
|
66
62
|
end
|
67
63
|
|
68
64
|
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of athena, the database file converter. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# athena is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# athena is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with athena. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'rubygems'
|
30
|
+
require 'ferret'
|
31
|
+
|
32
|
+
class Athena::Formats
|
33
|
+
|
34
|
+
class Ferret < Athena::Formats
|
35
|
+
|
36
|
+
register_format :in, 'ferret'
|
37
|
+
|
38
|
+
attr_reader :record_element, :config, :parser, :match_all_query
|
39
|
+
|
40
|
+
def initialize(parser)
|
41
|
+
config = parser.config.dup
|
42
|
+
|
43
|
+
case @record_element = config.delete(:__record_element)
|
44
|
+
when String
|
45
|
+
# fine!
|
46
|
+
when nil
|
47
|
+
raise NoRecordElementError, 'no record element specified'
|
48
|
+
else
|
49
|
+
raise IllegalRecordElementError, "illegal record element #{@record_element}"
|
50
|
+
end
|
51
|
+
|
52
|
+
@config = config
|
53
|
+
@parser = parser
|
54
|
+
|
55
|
+
@match_all_query = ::Ferret::Search::MatchAllQuery.new
|
56
|
+
end
|
57
|
+
|
58
|
+
def parse(source)
|
59
|
+
search_all(source) { |doc|
|
60
|
+
record = Athena::Record.new(parser.block, doc[record_element])
|
61
|
+
|
62
|
+
config.each { |element, field_config|
|
63
|
+
record.update(element, doc[element], field_config)
|
64
|
+
}
|
65
|
+
|
66
|
+
record.close
|
67
|
+
}
|
68
|
+
end
|
69
|
+
|
70
|
+
private
|
71
|
+
|
72
|
+
def search_all(source)
|
73
|
+
index = ::Ferret::Index::Index.new(
|
74
|
+
:path => source.path,
|
75
|
+
:create_if_missing => false
|
76
|
+
).searcher
|
77
|
+
|
78
|
+
index.search_each(match_all_query, :limit => :all) { |doc_id, _|
|
79
|
+
yield index[doc_id] if block_given?
|
80
|
+
}
|
81
|
+
end
|
82
|
+
|
83
|
+
class NoRecordElementError < StandardError
|
84
|
+
end
|
85
|
+
|
86
|
+
class IllegalRecordElementError < StandardError
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
90
|
+
|
91
|
+
end
|
data/lib/athena/formats/lingo.rb
CHANGED
@@ -3,9 +3,9 @@
|
|
3
3
|
# #
|
4
4
|
# A component of athena, the database file converter. #
|
5
5
|
# #
|
6
|
-
# Copyright (C) 2007 University of Cologne,
|
7
|
-
#
|
8
|
-
#
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
9
|
# #
|
10
10
|
# Authors: #
|
11
11
|
# Jens Wille <jens.wille@uni-koeln.de> #
|
@@ -29,113 +29,109 @@
|
|
29
29
|
require 'iconv'
|
30
30
|
require 'enumerator'
|
31
31
|
|
32
|
-
|
32
|
+
class Athena::Formats
|
33
33
|
|
34
|
-
|
34
|
+
module Lingo
|
35
35
|
|
36
|
-
|
36
|
+
class Base < Athena::Formats
|
37
37
|
|
38
|
-
class
|
38
|
+
class << self
|
39
39
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
(v || '').strip.gsub(/(?:\r?\n)+/, ' ')
|
47
|
-
}.reject { |v| v.empty? }
|
48
|
-
}
|
40
|
+
def convert(record)
|
41
|
+
record.struct.inject([]) { |terms, (field, struct)|
|
42
|
+
terms << struct[:elements].inject([]) { |array, element|
|
43
|
+
array += (struct[:values][element] || []).map { |v|
|
44
|
+
(v || '').strip.gsub(/(?:\r?\n)+/, ' ')
|
45
|
+
}.reject { |v| v.empty? }
|
49
46
|
}
|
50
|
-
|
51
|
-
|
52
|
-
def deferred?
|
53
|
-
true
|
54
|
-
end
|
47
|
+
}
|
48
|
+
end
|
55
49
|
|
56
|
-
|
50
|
+
def deferred?
|
51
|
+
true
|
52
|
+
end
|
57
53
|
|
58
|
-
|
59
|
-
return true if block ? block[actual] : expected == actual
|
54
|
+
private
|
60
55
|
|
61
|
-
|
56
|
+
def check_number_of_arguments(expected, actual, blow = false, &block)
|
57
|
+
return true if block ? block[actual] : expected == actual
|
62
58
|
|
63
|
-
|
64
|
-
raise FormatArgumentError, msg
|
65
|
-
else
|
66
|
-
warn msg
|
67
|
-
return false
|
68
|
-
end
|
69
|
-
end
|
59
|
+
msg = "wrong number of arguments for #{self} (#{actual} for #{expected})"
|
70
60
|
|
71
|
-
|
72
|
-
|
61
|
+
if blow
|
62
|
+
raise FormatArgumentError, msg
|
63
|
+
else
|
64
|
+
warn msg
|
65
|
+
return false
|
73
66
|
end
|
67
|
+
end
|
74
68
|
|
69
|
+
def check_number_of_arguments!(expected, actual, &block)
|
70
|
+
check_number_of_arguments(expected, actual, true, &block)
|
75
71
|
end
|
76
72
|
|
77
73
|
end
|
78
74
|
|
79
|
-
|
80
|
-
class SingleWord < Athena::Formats::Lingo::Base
|
75
|
+
end
|
81
76
|
|
82
|
-
|
77
|
+
# "Nasenbär\n"
|
78
|
+
class SingleWord < Athena::Formats::Lingo::Base
|
83
79
|
|
84
|
-
|
85
|
-
super.flatten
|
86
|
-
end
|
80
|
+
register_formats :out, 'lingo/single_word'
|
87
81
|
|
82
|
+
def self.convert(record)
|
83
|
+
super.flatten
|
88
84
|
end
|
89
85
|
|
90
|
-
|
91
|
-
class KeyValue < Athena::Formats::Lingo::Base
|
86
|
+
end
|
92
87
|
|
93
|
-
|
88
|
+
# "John Vorhauer*Vorhauer, John\n"
|
89
|
+
class KeyValue < Athena::Formats::Lingo::Base
|
94
90
|
|
95
|
-
|
96
|
-
super.map { |terms|
|
97
|
-
next unless check_number_of_arguments(2, terms.size)
|
91
|
+
register_formats :out, 'lingo/key_value'
|
98
92
|
|
99
|
-
|
100
|
-
|
101
|
-
|
93
|
+
def self.convert(record)
|
94
|
+
super.map { |terms|
|
95
|
+
next unless check_number_of_arguments(2, terms.size)
|
102
96
|
|
97
|
+
terms.join('*')
|
98
|
+
}.compact
|
103
99
|
end
|
104
100
|
|
105
|
-
|
106
|
-
class WordClass < Athena::Formats::Lingo::Base
|
101
|
+
end
|
107
102
|
|
108
|
-
|
103
|
+
# "Essen,essen #v Essen #s Esse #s\n"
|
104
|
+
class WordClass < Athena::Formats::Lingo::Base
|
109
105
|
|
110
|
-
|
111
|
-
super.map { |terms|
|
112
|
-
next unless check_number_of_arguments('odd, > 1', terms.size) { |actual|
|
113
|
-
actual > 1 && actual % 2 == 1
|
114
|
-
}
|
106
|
+
register_formats :out, 'lingo/word_class'
|
115
107
|
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
108
|
+
def self.convert(record)
|
109
|
+
super.map { |terms|
|
110
|
+
next unless check_number_of_arguments('odd, > 1', terms.size) { |actual|
|
111
|
+
actual > 1 && actual % 2 == 1
|
112
|
+
}
|
121
113
|
|
114
|
+
[terms.shift, terms.to_enum(:each_slice, 2).map { |form, wc|
|
115
|
+
"#{form} ##{wc}"
|
116
|
+
}.join(' ')].join(',')
|
117
|
+
}.compact
|
122
118
|
end
|
123
119
|
|
124
|
-
|
125
|
-
class MultiValue < Athena::Formats::Lingo::Base
|
120
|
+
end
|
126
121
|
|
127
|
-
|
122
|
+
# "Fax;Faxkopie;Telefax\n"
|
123
|
+
class MultiValue < Athena::Formats::Lingo::Base
|
128
124
|
|
129
|
-
|
130
|
-
super.map { |terms|
|
131
|
-
next unless check_number_of_arguments('> 1', terms.size) { |actual|
|
132
|
-
actual > 1
|
133
|
-
}
|
125
|
+
register_formats :out, 'lingo/multi_value', 'lingo/multi_key'
|
134
126
|
|
135
|
-
|
136
|
-
|
137
|
-
|
127
|
+
def self.convert(record)
|
128
|
+
super.map { |terms|
|
129
|
+
next unless check_number_of_arguments('> 1', terms.size) { |actual|
|
130
|
+
actual > 1
|
131
|
+
}
|
138
132
|
|
133
|
+
terms.join(';')
|
134
|
+
}.compact
|
139
135
|
end
|
140
136
|
|
141
137
|
end
|