ascii-data-tools 0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/.rvmrc +1 -0
- data/.travis.yml +4 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +40 -0
- data/LICENSE.GPL2 +339 -0
- data/README.rdoc +52 -0
- data/Rakefile +42 -0
- data/TODO +4 -0
- data/ascii-data-tools.gemspec +30 -0
- data/bin/ascii-data-cat +13 -0
- data/bin/ascii-data-edit +13 -0
- data/bin/ascii-data-norm +13 -0
- data/bin/ascii-data-qdiff +13 -0
- data/bin/ascii-data-tools-config +9 -0
- data/examples/big +10000 -0
- data/examples/built_in_records.gz +0 -0
- data/examples/slightly_modified_built_in_records.gz +0 -0
- data/features/ascii-data-cat.feature +110 -0
- data/features/ascii-data-edit.feature +91 -0
- data/features/ascii-data-qdiff.feature +54 -0
- data/features/encoding_decoding.feature +68 -0
- data/features/normaliser.feature +27 -0
- data/features/plugins.feature +73 -0
- data/features/record_recognition.feature +61 -0
- data/features/step_definitions/ascii-data-cat_steps.rb +48 -0
- data/features/step_definitions/ascii-data-edit_steps.rb +38 -0
- data/features/step_definitions/ascii-data-norm_steps.rb +7 -0
- data/features/step_definitions/ascii-data-qdiff_steps.rb +43 -0
- data/features/step_definitions/encoding_decoding_steps.rb +23 -0
- data/features/step_definitions/plugins_steps.rb +11 -0
- data/features/step_definitions/record_recognition_steps.rb +10 -0
- data/features/support/env.rb +5 -0
- data/lib/ascii-data-tools.rb +8 -0
- data/lib/ascii-data-tools/configuration.rb +169 -0
- data/lib/ascii-data-tools/configuration_printer.rb +38 -0
- data/lib/ascii-data-tools/controller.rb +123 -0
- data/lib/ascii-data-tools/discover.rb +19 -0
- data/lib/ascii-data-tools/external_programs.rb +23 -0
- data/lib/ascii-data-tools/filter.rb +148 -0
- data/lib/ascii-data-tools/filter/diffing.rb +139 -0
- data/lib/ascii-data-tools/formatting.rb +109 -0
- data/lib/ascii-data-tools/global_autodiscovery.rb +21 -0
- data/lib/ascii-data-tools/record.rb +50 -0
- data/lib/ascii-data-tools/record_type.rb +139 -0
- data/lib/ascii-data-tools/record_type/builder.rb +50 -0
- data/lib/ascii-data-tools/record_type/decoder.rb +77 -0
- data/lib/ascii-data-tools/record_type/encoder.rb +17 -0
- data/lib/ascii-data-tools/record_type/field.rb +168 -0
- data/lib/ascii-data-tools/record_type/normaliser.rb +38 -0
- data/lib/ascii-data-tools/ruby_extensions.rb +7 -0
- data/lib/ascii-data-tools/version.rb +3 -0
- data/spec/ascii-data-tools/configuration_printer_spec.rb +51 -0
- data/spec/ascii-data-tools/configuration_spec.rb +153 -0
- data/spec/ascii-data-tools/discover_spec.rb +8 -0
- data/spec/ascii-data-tools/filter/diffing_spec.rb +82 -0
- data/spec/ascii-data-tools/filter_spec.rb +107 -0
- data/spec/ascii-data-tools/formatting_spec.rb +106 -0
- data/spec/ascii-data-tools/record_spec.rb +49 -0
- data/spec/ascii-data-tools/record_type/builder_spec.rb +69 -0
- data/spec/ascii-data-tools/record_type/decoder_spec.rb +73 -0
- data/spec/ascii-data-tools/record_type/encoder_spec.rb +32 -0
- data/spec/ascii-data-tools/record_type/field_spec.rb +160 -0
- data/spec/ascii-data-tools/record_type/normaliser_spec.rb +25 -0
- data/spec/ascii-data-tools/record_type_spec.rb +175 -0
- data/spec/filter_helper.rb +24 -0
- data/spec/record_type_helpers.rb +8 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +5 -0
- metadata +196 -0
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module AsciiDataTools
|
4
|
+
module Record
|
5
|
+
# A decoded record: a record type together with the content values extracted
# for it. Values are matched to the type's field names by position.
class Record
  attr_reader :type

  # type           - the record type; this class calls name, field_names and
  #                  encode on it.
  # content_values - the record's values, in the order of type.field_names.
  def initialize(type, content_values)
    @type = type
    @values_by_type = {:content => content_values}
  end

  # Returns the value stored under the given field name.
  # Raises RuntimeError when the type has no field with that name.
  def [](requested_field_name)
    pair = field_names_and_values.detect {|field_name, _| field_name == requested_field_name }
    raise "Field name '#{requested_field_name}' does not exist!" if pair.nil?
    pair.last
  end

  # Name of the record's type.
  def type_name
    @type.name
  end

  # Returns an array of [field_name, value] pairs.
  def to_a
    field_names_and_values
  end

  # The content values, in field order.
  def values
    @values_by_type[:content]
  end

  # Delegates to the type's encode, passing the content values.
  def encode
    @type.encode(values)
  end

  # Renders the record as "<type name>: <field> => <inspected value>, ...".
  def to_s
    contents = field_names_and_values.map {|field_name, value| "#{field_name} => #{value.inspect}" }.join(", ")
    "#{type_name}: #{contents}"
  end

  # Two records are equal when both their types and their values are equal.
  # Objects that do not expose type and values (nil, strings, ...) are simply
  # not equal instead of raising NoMethodError.
  def ==(other)
    return false unless other.respond_to?(:type) && other.respond_to?(:values)
    type == other.type && values == other.values
  end

  protected

  # Field names of the type zipped with the content values.
  def field_names_and_values
    @type.field_names.zip(values)
  end
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'forwardable'
|
3
|
+
require 'ascii-data-tools/record_type/field'
|
4
|
+
require 'ascii-data-tools/record_type/builder'
|
5
|
+
require 'ascii-data-tools/record_type/normaliser'
|
6
|
+
require 'ascii-data-tools/record_type/decoder'
|
7
|
+
require 'ascii-data-tools/record_type/encoder'
|
8
|
+
|
9
|
+
module AsciiDataTools
|
10
|
+
module RecordType
|
11
|
+
# Mixin for record types made of fixed-length fields: pulls in the
# fixed-length decoder and encoder plus the normaliser.
module FixedLengthType
  include Decoder::FixedLengthRecordDecoder
  include Encoder::FixedLengthRecordEncoder
  include Normaliser::Normaliser

  # Sum of the lengths of all fields. The result is cached after the first
  # call.
  def total_length_of_fields
    @total_length ||= fields.inject(0) {|running_total, each_field|
      running_total + each_field.length
    }
  end
end
|
20
|
+
|
21
|
+
# Mixin for divider-separated record types: pulls in the CSV decoder and the
# CSV encoder.
module CsvType
  include Decoder::CsvRecordDecoder
  include Encoder::CsvRecordEncoder
end
|
25
|
+
|
26
|
+
# Describes one kind of record: a name, the content fields that make up the
# record, and two meta fields (:filename and :divider).
#
# Messages the type itself does not understand are forwarded to its content
# fields.
class Type
  extend Forwardable
  attr_reader :name

  # Conversion hooks that Ruby probes implicitly (Array#flatten, puts, splat,
  # Kernel#Array). They are not advertised through respond_to?, otherwise a
  # type would silently be unpacked into its fields in those contexts.
  NON_ADVERTISED_CONVERSIONS = [:to_ary, :to_a].freeze

  def_delegator :fields, :names, :field_names
  def_delegator :fields, :with_index, :field_with_index

  # name           - the type's name.
  # content_fields - the content fields; defaults to an empty Field::Fields.
  def initialize(name, content_fields = Field::Fields.new)
    @name = name
    @fields_by_type = {:content => content_fields, :meta => make_meta_fields}
  end

  # Finds a field by name among the content fields and the meta fields.
  def field_with_name(name)
    all_fields.with_name(name)
  end

  # Forwards unknown messages to the content fields. When the content fields
  # cannot handle the message either, the usual NoMethodError is raised
  # against this type rather than against the fields collection.
  def method_missing(method_name, *args, &block)
    return super unless content_fields.respond_to?(method_name, true)
    content_fields.send(method_name, *args, &block)
  end

  # Keeps respond_to? in line with method_missing (see
  # NON_ADVERTISED_CONVERSIONS for the deliberate exceptions).
  def respond_to_missing?(method_name, include_private = false)
    return super if NON_ADVERTISED_CONVERSIONS.include?(method_name)
    content_fields.respond_to?(method_name, include_private) || super
  end

  # Constrains the :filename meta field to the given value.
  def filename_should_match(value)
    field_with_name(:filename).should_be_constrained_to(value)
  end

  protected
  attr_reader :fields_by_type

  def content_fields
    @fields_by_type[:content]
  end

  alias :fields :content_fields

  # Builds the meta fields every type carries: :filename and :divider.
  def make_meta_fields
    Field::Fields.new([Field::Field.new(:filename), Field::ConstantField.new(:divider)])
  end

  # Content fields followed by meta fields, as a new Field::Fields.
  def all_fields
    Field::Fields.new(@fields_by_type[:content] + @fields_by_type[:meta])
  end
end
|
67
|
+
|
68
|
+
# Record type named "unknown" with a single content field called "UNKNOWN".
# Decoding comes from Decoder::UnknownRecordDecoder.
class UnknownType < Type
  include Decoder::UnknownRecordDecoder
  # Frozen so the shared name string cannot be mutated through a type's name.
  UNKNOWN_RECORD_TYPE_NAME = "unknown".freeze

  def initialize
    super(UNKNOWN_RECORD_TYPE_NAME, Field::Fields.new([Field::Field.new("UNKNOWN")]))
  end
end
|
76
|
+
|
77
|
+
# Works out which record type an encoded record belongs to. Types that have
# matched before are tried first, then the full repository.
class TypeDeterminer
  # type_repo - repository holding every known type.
  def initialize(type_repo = RecordTypeRepository.new)
    @all_types = type_repo
    @previously_matched_types = RecordTypeRepository.new
  end

  # Returns the matching type, remembering it for later lookups, or a new
  # UnknownType when nothing matches.
  def determine_type_for(encoded_record)
    recognised = @previously_matched_types.identify_type_for(encoded_record)
    recognised ||= @all_types.identify_type_for(encoded_record)
    return UnknownType.new if recognised.nil?

    @previously_matched_types << recognised
    recognised
  end
end
|
95
|
+
|
96
|
+
# A set of record types. Enumerable over the types, and includes
# Builder::TypeBuilder so new types can be declared through record_type.
class RecordTypeRepository
  include Enumerable
  include Builder::TypeBuilder

  # types - initial collection of types; stored in a Set.
  def initialize(types = [])
    @types = Set.new(types)
  end

  # Adds a type to the repository.
  def <<(type)
    @types << type
  end

  # Removes every type.
  def clear
    @types.clear
  end

  # Returns the first type whose name equals the given name, or nil.
  def find_by_name(name)
    detect {|type| type.name == name }
  end

  alias :type :find_by_name

  def each(&block)
    @types.each(&block)
  end

  # Returns the first type that reports it is able to decode the encoded
  # record, or nil.
  def identify_type_for(encoded_record)
    @types.detect {|type| type.able_to_decode?(encoded_record) }
  end

  # Calls the block with every type whose name matches the matcher.
  #
  # matcher - a Regexp (matched with =~), a Proc (called with the name), or
  #           any other object, which is compared using ===. Previously
  #           matchers other than Regexp and Proc were silently ignored.
  #
  # Returns the matching types. Without a block nothing is called and the
  # matching types are just returned.
  def for_names_matching(matcher, &block)
    matching_types = select do |type|
      case matcher
      when Regexp then type.name =~ matcher
      when Proc   then matcher[type.name]
      else matcher === type.name
      end
    end
    return matching_types if block.nil?
    matching_types.each {|found_type| block[found_type] }
  end

  # Builds a type from the name, properties and definition block, then adds
  # it to the repository.
  def record_type(name, props = {}, &definition)
    self << build_type(name, props, &definition)
  end
end
|
138
|
+
end
|
139
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'ascii-data-tools/record_type/field'
|
2
|
+
|
3
|
+
module AsciiDataTools
|
4
|
+
module RecordType
|
5
|
+
# Helpers for building fields and record types from property hashes.
module Builder
  module FieldBuilder
    # Builds a fixed-length field.
    #
    # name       - the field name.
    # properties - :length         length handed to the field,
    #              :constrained_to constraint value, applied unless nil,
    #              :normalised     marks the field as normalised when truthy.
    def build_field(name, properties = {})
      field = Field::FixedLengthField.new(name, properties[:length])
      constraint = properties[:constrained_to]
      field.should_be_constrained_to(constraint) unless constraint.nil?
      field.should_be_normalised if properties[:normalised]
      field
    end
  end

  module TypeBuilder
    include FieldBuilder

    # Builds a record type.
    #
    # type_name  - name of the new type.
    # properties - :family ("csv" or "fixed_length", as String or Symbol;
    #              fixed length when absent),
    #              :applies_for_filenames_matching (constraint for the
    #              :filename field) and :divider (value of the :divider
    #              field).
    # block      - evaluated with instance_eval; declares fields via +field+.
    #
    # Raises ArgumentError for an unknown :family.
    def build_type(type_name, properties = {}, &block)
      type = Type.new(type_name, build_fields(&block))
      type.extend(determine_type_family_from(properties))

      type.field_with_name(:filename).should_be_constrained_to(properties[:applies_for_filenames_matching])
      type.field_with_name(:divider).value = properties[:divider]
      type
    end

    # Evaluates the block against self, collecting every declared field, and
    # returns the collected fields.
    def build_fields(&block)
      @fields = Field::Fields.new
      instance_eval(&block) unless block.nil?
      @fields
    end

    protected

    # Declares one field; meant to be called from the block given to
    # build_type / build_fields.
    def field(name, properties = {})
      @fields << build_field(name, properties)
    end

    # Maps properties[:family] to the mixin implementing that family.
    # An unknown family used to yield nil, which only failed later inside
    # extend with an unrelated TypeError; it now fails here with a message
    # naming the offending value.
    def determine_type_family_from(properties)
      family = properties[:family]
      return FixedLengthType if family.nil?

      case family.to_s
      when "csv" then CsvType
      when "fixed_length" then FixedLengthType
      else
        raise ArgumentError, "Unknown record type family #{family.inspect} (expected \"csv\" or \"fixed_length\")"
      end
    end
  end
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module AsciiDataTools
|
2
|
+
module RecordType
|
3
|
+
# Mixins that turn an encoded record (a hash with :ascii_string and,
# optionally, :filename) into a Record::Record.
module Decoder
  # Common decoding protocol. Including modules supply
  # able_to_decode_content? and split_into_values.
  module RecordDecoder
    # True-ish when the content can be decoded and the meta fields are valid.
    def able_to_decode?(encoded_record)
      able_to_decode_content?(encoded_record[:ascii_string]) && meta_fields_valid?(encoded_record)
    end

    # Builds a Record::Record for this type from the encoded record's
    # :ascii_string.
    def decode(encoded_record)
      Record::Record.new(self, split_into_values(encoded_record[:ascii_string]))
    end

    protected

    def able_to_decode_content?(encoded_string)
      raise "Must be implemented in submodule!"
    end

    def split_into_values(ascii_string)
      raise "Must be implemented in submodule!"
    end

    # A record without a :filename is always acceptable; otherwise the
    # :filename meta field decides.
    def meta_fields_valid?(encoded_record)
      filename = encoded_record[:filename]
      filename.nil? || filename_field.valid_input?(filename)
    end

    # The :filename meta field, looked up once and cached.
    def filename_field
      @filename_field ||= fields_by_type[:meta].with_name(:filename)
    end
  end

  # Decoding driven by a single regexp whose capture groups are the values.
  # Including modules supply regexp_string.
  module RegexpBasedDecoder
    include RecordDecoder

    protected

    def able_to_decode_content?(encoded_string)
      encoded_string =~ regexp_for_matching_type
    end

    # The capture groups of the match, in order.
    def split_into_values(ascii_string)
      ascii_string.match(regexp_for_matching_type).to_a[1..-1]
    end

    # The compiled regexp (multiline mode), built once and cached.
    def regexp_for_matching_type
      @regexp ||= Regexp.new(regexp_string, Regexp::MULTILINE)
    end

    def regexp_string
      raise "Must be implemented in submodule!"
    end
  end

  module FixedLengthRecordDecoder
    include RegexpBasedDecoder

    protected

    # "\A", then whatever each content field appends, then "\z".
    def regexp_string
      content_fields.inject("\\A") {|pattern_so_far, field| field.extend_regexp_string_for_matching(pattern_so_far) } + "\\z"
    end
  end

  module CsvRecordDecoder
    include RegexpBasedDecoder

    protected

    # One lazy capture group per content field, separated by the divider and
    # terminated by a newline.
    def regexp_string
      # The divider is literal text (the encoder joins values with it as-is),
      # so it must be escaped: an unescaped "|" or "." would act as a regexp
      # metacharacter and break matching.
      escaped_divider = Regexp.escape(field_with_name(:divider).value)
      "\\A" + "(.*?)" + ((escaped_divider + "(.*?)") * (content_fields.length - 1)) + "\n\\z"
    end
  end

  module UnknownRecordDecoder
    # Wraps the whole :ascii_string as the record's single value.
    def decode(encoded_record)
      Record::Record.new(self, [encoded_record[:ascii_string]])
    end
  end
end
|
76
|
+
end
|
77
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module AsciiDataTools
|
2
|
+
module RecordType
|
3
|
+
# Mixins that turn a record's values back into its encoded string.
module Encoder
  module FixedLengthRecordEncoder
    # Concatenates the values with nothing in between.
    def encode(values)
      values.join
    end
  end

  module CsvRecordEncoder
    # Joins the values with the value of the :divider field and appends a
    # newline.
    def encode(values)
      divider = field_with_name(:divider).value
      values.join(divider) + "\n"
    end
  end
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,168 @@
|
|
1
|
+
module AsciiDataTools
|
2
|
+
module RecordType
|
3
|
+
# Fields, field collections and the constraints that can be placed on a
# field's value.
module Field
  # Lookup helpers for a collection of fields. Expects the including class to
  # provide detect, select and [].
  module GreppableFields
    # First field with the given name, or nil.
    def with_name(field_name)
      detect {|field| field.name == field_name }
    end

    # The fields for which the block is true-ish, as a new Fields.
    def fields_with(&block)
      Fields.new(select(&block))
    end

    # Field at the given 1-based position. Note that index 0 becomes self[-1],
    # i.e. the last element.
    def with_index(index)
      self[index - 1]
    end
  end

  # An ordered collection of fields.
  class Fields < Array
    include GreppableFields

    # The names of all fields, in order.
    def names
      map {|field| field.name }
    end

    def number_of_content_fields
      size
    end

    # Length of the longest field name, 0 for an empty collection.
    # Computed on every call: the collection is mutable, so a cached value
    # would go stale as soon as a field is appended. Names are converted with
    # to_s so Symbol names work as well.
    def length_of_longest_field_name
      names.map {|name| name.to_s.length }.max || 0
    end

    # The non-empty constraint descriptions of the fields, joined with ", ".
    def constraints_description
      descriptions = map {|field| field.constraint_description }
      descriptions.reject {|description| description.empty? }.join(", ")
    end

    # Marks every field as normalised.
    def should_be_normalised
      each {|field| field.should_be_normalised }
    end

    # The names of the normalised fields, joined with ", ".
    def names_of_normalised_fields
      select {|field| field.normalised? }.map {|field| field.name }.join(", ")
    end
  end

  # A named field with a constraint and a normalised flag.
  class Field
    attr_reader :name
    attr_writer :constraint

    def initialize(name, constraint = NoConstraint.new)
      @name = name
      @constraint = constraint
      @normalised = false
    end

    def normalised?
      @normalised
    end

    def should_be_normalised
      @normalised = true
    end

    # "<name> <constraint>" or "" when the constraint renders as empty.
    def constraint_description
      description = @constraint.to_s
      description.empty? ? "" : name.to_s + " " + description
    end

    # Replaces the constraint: a Regexp gives a RegexpConstraint, anything
    # else a OneOfConstraint.
    def should_be_constrained_to(value)
      if value.is_a?(Regexp)
        @constraint = RegexpConstraint.new(value)
      else
        @constraint = OneOfConstraint.new(value)
      end
    end

    # Whether the value satisfies the current constraint.
    def valid_input?(value)
      @constraint.satisfied_by?(value)
    end
  end

  # A field that additionally carries a settable value.
  class ConstantField < Field
    attr_accessor :value
  end

  # A field with a length; constrained to that length unless another
  # constraint is supplied.
  class FixedLengthField < Field
    attr_reader :length

    def initialize(name, length, constraint = nil)
      super(name, constraint || FixedLengthConstraint.new(length))
      @length = length
    end

    # Lets the current constraint append its part to the regexp source.
    def extend_regexp_string_for_matching(regexp_string)
      @constraint.extend_regexp_string_for_matching(regexp_string)
    end
  end

  # Constraint that accepts everything and adds nothing to the regexp.
  class NoConstraint
    def extend_regexp_string_for_matching(regexp_string)
      regexp_string
    end

    def satisfied_by?(string)
      true
    end

    def to_s; ""; end
  end

  # Base class: a string satisfies the constraint when it matches the regexp
  # fragment the constraint contributes (the match is not anchored).
  class Constraint
    def satisfied_by?(string)
      string =~ Regexp.new(extend_regexp_string_for_matching(""))
    end
  end

  # Contributes a capture group of exactly @length characters.
  class FixedLengthConstraint < Constraint
    def initialize(length)
      @length = length
    end

    def extend_regexp_string_for_matching(regexp_string)
      regexp_string + "(.{#{@length}})"
    end

    def to_s; ""; end
  end

  # Contributes a capture group matching any one of the possible values.
  class OneOfConstraint < Constraint
    def initialize(*possible_values)
      @possible_values = possible_values.flatten
    end

    # Values are literal text, so they are escaped before being placed in the
    # regexp; otherwise "A.B" would also accept "AXB". Regexp elements are
    # inserted unescaped, exactly as before.
    def extend_regexp_string_for_matching(regexp_string)
      alternatives = @possible_values.map do |value|
        value.is_a?(Regexp) ? value.to_s : Regexp.escape(value.to_s)
      end
      regexp_string + "(#{alternatives.join('|')})"
    end

    def to_s
      if @possible_values.length == 1
        "= #{@possible_values.first}"
      else
        "one of #{@possible_values.join(', ')}"
      end
    end
  end

  # Constraint defined directly by a regexp.
  class RegexpConstraint < Constraint
    def initialize(regexp_that_must_match)
      @regexp_that_must_match = regexp_that_must_match
    end

    # Appends the regexp's source as-is (no capture group is added).
    def extend_regexp_string_for_matching(regexp_string)
      regexp_string + @regexp_that_must_match.source
    end

    def satisfied_by?(string)
      string =~ @regexp_that_must_match
    end

    def to_s
      "=~ #{@regexp_that_must_match.inspect}"
    end
  end
end
|
167
|
+
end
|
168
|
+
end
|