ascii-data-tools 0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/.rvmrc +1 -0
- data/.travis.yml +4 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +40 -0
- data/LICENSE.GPL2 +339 -0
- data/README.rdoc +52 -0
- data/Rakefile +42 -0
- data/TODO +4 -0
- data/ascii-data-tools.gemspec +30 -0
- data/bin/ascii-data-cat +13 -0
- data/bin/ascii-data-edit +13 -0
- data/bin/ascii-data-norm +13 -0
- data/bin/ascii-data-qdiff +13 -0
- data/bin/ascii-data-tools-config +9 -0
- data/examples/big +10000 -0
- data/examples/built_in_records.gz +0 -0
- data/examples/slightly_modified_built_in_records.gz +0 -0
- data/features/ascii-data-cat.feature +110 -0
- data/features/ascii-data-edit.feature +91 -0
- data/features/ascii-data-qdiff.feature +54 -0
- data/features/encoding_decoding.feature +68 -0
- data/features/normaliser.feature +27 -0
- data/features/plugins.feature +73 -0
- data/features/record_recognition.feature +61 -0
- data/features/step_definitions/ascii-data-cat_steps.rb +48 -0
- data/features/step_definitions/ascii-data-edit_steps.rb +38 -0
- data/features/step_definitions/ascii-data-norm_steps.rb +7 -0
- data/features/step_definitions/ascii-data-qdiff_steps.rb +43 -0
- data/features/step_definitions/encoding_decoding_steps.rb +23 -0
- data/features/step_definitions/plugins_steps.rb +11 -0
- data/features/step_definitions/record_recognition_steps.rb +10 -0
- data/features/support/env.rb +5 -0
- data/lib/ascii-data-tools.rb +8 -0
- data/lib/ascii-data-tools/configuration.rb +169 -0
- data/lib/ascii-data-tools/configuration_printer.rb +38 -0
- data/lib/ascii-data-tools/controller.rb +123 -0
- data/lib/ascii-data-tools/discover.rb +19 -0
- data/lib/ascii-data-tools/external_programs.rb +23 -0
- data/lib/ascii-data-tools/filter.rb +148 -0
- data/lib/ascii-data-tools/filter/diffing.rb +139 -0
- data/lib/ascii-data-tools/formatting.rb +109 -0
- data/lib/ascii-data-tools/global_autodiscovery.rb +21 -0
- data/lib/ascii-data-tools/record.rb +50 -0
- data/lib/ascii-data-tools/record_type.rb +139 -0
- data/lib/ascii-data-tools/record_type/builder.rb +50 -0
- data/lib/ascii-data-tools/record_type/decoder.rb +77 -0
- data/lib/ascii-data-tools/record_type/encoder.rb +17 -0
- data/lib/ascii-data-tools/record_type/field.rb +168 -0
- data/lib/ascii-data-tools/record_type/normaliser.rb +38 -0
- data/lib/ascii-data-tools/ruby_extensions.rb +7 -0
- data/lib/ascii-data-tools/version.rb +3 -0
- data/spec/ascii-data-tools/configuration_printer_spec.rb +51 -0
- data/spec/ascii-data-tools/configuration_spec.rb +153 -0
- data/spec/ascii-data-tools/discover_spec.rb +8 -0
- data/spec/ascii-data-tools/filter/diffing_spec.rb +82 -0
- data/spec/ascii-data-tools/filter_spec.rb +107 -0
- data/spec/ascii-data-tools/formatting_spec.rb +106 -0
- data/spec/ascii-data-tools/record_spec.rb +49 -0
- data/spec/ascii-data-tools/record_type/builder_spec.rb +69 -0
- data/spec/ascii-data-tools/record_type/decoder_spec.rb +73 -0
- data/spec/ascii-data-tools/record_type/encoder_spec.rb +32 -0
- data/spec/ascii-data-tools/record_type/field_spec.rb +160 -0
- data/spec/ascii-data-tools/record_type/normaliser_spec.rb +25 -0
- data/spec/ascii-data-tools/record_type_spec.rb +175 -0
- data/spec/filter_helper.rb +24 -0
- data/spec/record_type_helpers.rb +8 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +5 -0
- metadata +196 -0
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module AsciiDataTools
|
4
|
+
module Record
|
5
|
+
# A decoded record: a list of values paired with the record type that
# supplies the field names and the encoding rules.
class Record
  attr_reader :type

  # type           - object describing the record; this class calls #name,
  #                  #field_names and #encode on it.
  # content_values - Array of values, positionally aligned with
  #                  type.field_names.
  def initialize(type, content_values)
    @type = type
    @values_by_type = {:content => content_values}
  end

  # Returns the value of the field called +requested_field_name+.
  # Raises RuntimeError when the record type has no field of that name.
  def [](requested_field_name)
    requested_key_value_pair = field_names_and_values.detect {|field_name, _value| field_name == requested_field_name }
    raise "Field name '#{requested_field_name}' does not exist!" if requested_key_value_pair.nil?
    requested_key_value_pair.last
  end

  # Name of the record type, as reported by the type itself.
  def type_name
    @type.name
  end

  # Returns an Array of [field_name, value] pairs in field order.
  def to_a
    field_names_and_values
  end

  # Returns the content values handed to the constructor.
  def values
    @values_by_type[:content]
  end

  # Delegates to the type to turn the values back into their encoded form.
  def encode
    @type.encode(values)
  end

  # Renders the record as "<type name>: <field> => <value.inspect>, ...".
  def to_s
    contents = field_names_and_values.map {|field_name, value| "#{field_name} => #{value.inspect}" }.join(", ")
    "#{type_name}: #{contents}"
  end

  # Two records are equal when both their types and their values are equal.
  # Objects that do not expose #type and #values (nil, strings, ...) are
  # simply unequal instead of raising NoMethodError.
  def ==(other)
    return false unless other.respond_to?(:type) && other.respond_to?(:values)
    type == other.type && values == other.values
  end

  protected

  # Pairs every field name of the type with the value at the same position.
  def field_names_and_values
    @type.field_names.zip(values)
  end
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'forwardable'
|
3
|
+
require 'ascii-data-tools/record_type/field'
|
4
|
+
require 'ascii-data-tools/record_type/builder'
|
5
|
+
require 'ascii-data-tools/record_type/normaliser'
|
6
|
+
require 'ascii-data-tools/record_type/decoder'
|
7
|
+
require 'ascii-data-tools/record_type/encoder'
|
8
|
+
|
9
|
+
module AsciiDataTools
|
10
|
+
module RecordType
|
11
|
+
# Mixin that gives a record type the fixed-length decoder, the fixed-length
# encoder and the normaliser.
module FixedLengthType
  include Decoder::FixedLengthRecordDecoder
  include Encoder::FixedLengthRecordEncoder
  include Normaliser::Normaliser

  # Sum of the #length of every field returned by #fields. The result is
  # computed on first use and cached in @total_length.
  def total_length_of_fields
    @total_length ||= fields.map {|field| field.length }.inject(0) {|total, length| total + length }
  end
end

# Mixin that gives a record type the CSV decoder and the CSV encoder.
module CsvType
  include Decoder::CsvRecordDecoder
  include Encoder::CsvRecordEncoder
end
|
25
|
+
|
26
|
+
# Describes one kind of record: a name, the content fields that make up the
# record body, and two meta fields (:filename and :divider) created by
# #make_meta_fields.
#
# Messages the type does not understand itself are forwarded to the content
# field list, provided that list responds to them publicly.
class Type
  extend Forwardable
  attr_reader :name

  def_delegator :fields, :names, :field_names
  def_delegator :fields, :with_index, :field_with_index

  # name           - the name of the record type.
  # content_fields - field list for the record body (defaults to an empty
  #                  Field::Fields).
  def initialize(name, content_fields = Field::Fields.new)
    @name = name
    @fields_by_type = {:content => content_fields, :meta => make_meta_fields}
  end

  # Looks a field up by name among the content fields and the meta fields.
  def field_with_name(name)
    all_fields.with_name(name)
  end

  # Forwards unknown messages to the content fields. Anything the field list
  # does not publicly respond to is handed to +super+, so the NoMethodError
  # names this object and private methods of the list are not reachable.
  def method_missing(method_name, *args, &block)
    if content_fields.respond_to?(method_name)
      content_fields.send(method_name, *args, &block)
    else
      super
    end
  end

  # Keeps #respond_to? truthful about the messages #method_missing forwards.
  def respond_to_missing?(method_name, include_private = false)
    content_fields.respond_to?(method_name) || super
  end

  # Constrains the :filename meta field to +value+.
  def filename_should_match(value)
    field_with_name(:filename).should_be_constrained_to(value)
  end

  protected
  attr_reader :fields_by_type

  def content_fields
    @fields_by_type[:content]
  end

  alias :fields :content_fields

  # Builds the meta field list: a plain :filename field and a constant
  # :divider field.
  def make_meta_fields
    Field::Fields.new([Field::Field.new(:filename), Field::ConstantField.new(:divider)])
  end

  # Content fields followed by meta fields, wrapped in a fresh field list.
  def all_fields
    Field::Fields.new(@fields_by_type[:content] + @fields_by_type[:meta])
  end
end
|
67
|
+
|
68
|
+
# Record type named "unknown" with a single content field called "UNKNOWN".
# TypeDeterminer returns it when no registered type can decode a record.
class UnknownType < Type
  include Decoder::UnknownRecordDecoder
  UNKNOWN_RECORD_TYPE_NAME = "unknown"

  def initialize
    catch_all_field = Field::Field.new("UNKNOWN")
    super(UNKNOWN_RECORD_TYPE_NAME, Field::Fields.new([catch_all_field]))
  end
end
|
76
|
+
|
77
|
+
# Picks the record type able to decode an encoded record. Types that matched
# before are tried first, then the full repository.
class TypeDeterminer
  # type_repo - repository of candidate types; must respond to
  #             #identify_type_for.
  def initialize(type_repo = RecordTypeRepository.new)
    @all_types = type_repo
    @previously_matched_types = RecordTypeRepository.new
  end

  # Returns the first type that reports it can decode +encoded_record+, or
  # the unknown type when none does. A matching type is remembered so it is
  # tried first on later calls.
  def determine_type_for(encoded_record)
    matching_type =
      @previously_matched_types.identify_type_for(encoded_record) ||
      @all_types.identify_type_for(encoded_record)
    return unknown_type if matching_type.nil?

    @previously_matched_types << matching_type
    matching_type
  end

  protected

  # One UnknownType is shared by every unrecognised record. Types compare by
  # identity, so handing out a fresh instance per record would make two
  # otherwise identical unknown records unequal; sharing also avoids one
  # allocation per record.
  def unknown_type
    @unknown_type ||= UnknownType.new
  end
end
|
95
|
+
|
96
|
+
# A set of record types that can be enumerated, searched by name, asked
# which type decodes a given record, and extended through the type-building
# DSL mixed in from Builder::TypeBuilder.
class RecordTypeRepository
  include Enumerable
  include Builder::TypeBuilder

  # types - initial collection of types; stored in a Set, so duplicates
  #         collapse.
  def initialize(types = [])
    @types = Set.new(types)
  end

  # Adds +type+ to the repository.
  def <<(type)
    @types << type
  end

  # Removes every type.
  def clear
    @types.clear
  end

  # Returns the first type whose name equals +name+, or nil.
  def find_by_name(name)
    detect {|type| type.name == name }
  end

  alias :type :find_by_name

  def each(&block)
    @types.each(&block)
  end

  # Returns the first type that reports it can decode +encoded_record+, or
  # nil when none can.
  def identify_type_for(encoded_record)
    @types.detect {|type| type.able_to_decode?(encoded_record) }
  end

  # Calls the block with every type whose name is accepted by +matcher+ and
  # returns the matching types.
  #
  # matcher - a Regexp (matched against the name), a Proc (called with the
  #           name), or any other object, which is compared with the name
  #           using ==. Other objects used to be ignored silently.
  def for_names_matching(matcher, &block)
    name_matches =
      case matcher
      when Regexp then lambda {|name| name =~ matcher }
      when Proc   then matcher
      else             lambda {|name| name == matcher }
      end
    select {|type| name_matches[type.name] }.each {|found_type| block[found_type] }
  end

  # Builds a type from the DSL block and adds it to the repository.
  def record_type(name, props = {}, &definition)
    self << build_type(name, props, &definition)
  end
end
|
138
|
+
end
|
139
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'ascii-data-tools/record_type/field'
|
2
|
+
|
3
|
+
module AsciiDataTools
|
4
|
+
module RecordType
|
5
|
+
# DSL helpers for building fields and record types.
module Builder
  module FieldBuilder
    # Builds a fixed-length field.
    #
    # properties - Hash of options:
    #              :length         - passed to the field as its length.
    #              :constrained_to - constraint value, applied when not nil.
    #              :normalised     - marks the field normalised when truthy.
    def build_field(name, properties = {})
      field = Field::FixedLengthField.new(name, properties[:length])
      field.should_be_constrained_to(properties[:constrained_to]) unless properties[:constrained_to].nil?
      field.should_be_normalised if properties[:normalised]
      field
    end
  end

  module TypeBuilder
    include FieldBuilder

    # Builds a record type called +type_name+.
    #
    # properties - Hash of options:
    #              :family  - "csv" or "fixed_length" (String or Symbol);
    #                         fixed length when omitted.
    #              :applies_for_filenames_matching - constraint for the
    #                         :filename meta field, applied when not nil.
    #              :divider - value stored on the :divider meta field.
    # block      - evaluated against this builder; each +field+ call inside
    #              it adds one content field.
    #
    # Raises ArgumentError for an unrecognised :family.
    def build_type(type_name, properties = {}, &block)
      type = Type.new(type_name, build_fields(&block))
      type.extend(determine_type_family_from(properties))

      # Same nil guard as build_field: leave the filename field
      # unconstrained when no filename option was supplied.
      filename_constraint = properties[:applies_for_filenames_matching]
      type.field_with_name(:filename).should_be_constrained_to(filename_constraint) unless filename_constraint.nil?
      type.field_with_name(:divider).value = properties[:divider]
      type
    end

    # Starts a fresh field list, runs the DSL block (if any) against this
    # builder and returns the collected fields.
    def build_fields(&block)
      @fields = Field::Fields.new
      instance_eval(&block) unless block.nil?
      @fields
    end

    protected

    # DSL entry point used inside the block given to build_type/build_fields.
    def field(name, properties = {})
      @fields << build_field(name, properties)
    end

    # Maps the :family option onto the mixin that implements it.
    def determine_type_family_from(properties)
      family = properties[:family]
      case family.nil? ? nil : family.to_s
      when "csv" then CsvType
      when "fixed_length", nil then FixedLengthType
      else
        raise ArgumentError, "Unknown record type family #{family.inspect} (expected \"csv\" or \"fixed_length\")"
      end
    end
  end
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module AsciiDataTools
|
2
|
+
module RecordType
|
3
|
+
# Decoding mixins for record types. An "encoded record" is indexed with
# :ascii_string (the raw text) and, optionally, :filename.
module Decoder
  # Common decoding protocol. Including modules must supply
  # able_to_decode_content? and split_into_values.
  module RecordDecoder
    # Truthy when the raw text matches this type and, if the encoded record
    # carries a :filename, that filename is accepted by the filename field.
    def able_to_decode?(encoded_record)
      able_to_decode_content?(encoded_record[:ascii_string]) && meta_fields_valid?(encoded_record)
    end

    # Returns a Record::Record for this type holding the values split out of
    # the raw text.
    def decode(encoded_record)
      Record::Record.new(self, split_into_values(encoded_record[:ascii_string]))
    end

    protected

    def able_to_decode_content?(encoded_string)
      raise "Must be implemented in submodule!"
    end

    def split_into_values(ascii_string)
      raise "Must be implemented in submodule!"
    end

    # A record without a :filename is always acceptable.
    def meta_fields_valid?(encoded_record)
      encoded_record[:filename].nil? || filename_field.valid_input?(encoded_record[:filename])
    end

    def filename_field
      @filename_field ||= fields_by_type[:meta].with_name(:filename)
    end
  end

  # Decoder driven by one regular expression whose capture groups are the
  # field values. Including modules must supply regexp_string.
  module RegexpBasedDecoder
    include RecordDecoder

    protected

    def able_to_decode_content?(encoded_string)
      encoded_string =~ regexp_for_matching_type
    end

    # Returns the capture groups of the match (everything after the full
    # match at index 0).
    def split_into_values(ascii_string)
      ascii_string.match(regexp_for_matching_type).to_a[1..-1]
    end

    # Compiled once and cached; MULTILINE lets "." match newlines.
    def regexp_for_matching_type
      @regexp ||= Regexp.new(regexp_string, Regexp::MULTILINE)
    end

    def regexp_string
      raise "Must be implemented in submodule!"
    end
  end

  module FixedLengthRecordDecoder
    include RegexpBasedDecoder

    protected

    # Anchored regexp assembled by letting each content field append its own
    # fragment in order.
    def regexp_string
      content_fields.inject("\\A") {|regexp_string, field| field.extend_regexp_string_for_matching(regexp_string) } + "\\z"
    end
  end

  module CsvRecordDecoder
    include RegexpBasedDecoder

    protected

    # One lazy capture group per content field, separated by the divider and
    # terminated by a newline. The divider is regexp-escaped so that
    # characters such as "|", "." or "*" are matched literally.
    def regexp_string
      divider = Regexp.escape(field_with_name(:divider).value)
      "\\A" + "(.*?)" + ((divider + "(.*?)") * (content_fields.length - 1)) + "\n\\z"
    end
  end

  module UnknownRecordDecoder
    # Wraps the whole raw text as the single value of the record.
    def decode(encoded_record)
      Record::Record.new(self, [encoded_record[:ascii_string]])
    end
  end
end
|
76
|
+
end
|
77
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module AsciiDataTools
|
2
|
+
module RecordType
|
3
|
+
# Encoding mixins: turn a list of values back into a single string.
module Encoder
  module FixedLengthRecordEncoder
    # Concatenates the values with nothing in between.
    def encode(values)
      values.join
    end
  end

  module CsvRecordEncoder
    # Joins the values with the value of the :divider field and appends a
    # newline.
    def encode(values)
      divider = field_with_name(:divider).value
      joined_values = values.join(divider)
      joined_values + "\n"
    end
  end
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,168 @@
|
|
1
|
+
module AsciiDataTools
|
2
|
+
module RecordType
|
3
|
+
module Field
|
4
|
+
# Lookup helpers for a collection of fields.
module GreppableFields
  # Returns the first field whose name equals +field_name+, or nil.
  def with_name(field_name)
    detect {|field| field.name == field_name }
  end

  # Returns a new Fields holding the fields for which the block is truthy.
  def fields_with(&block)
    Fields.new(select(&block))
  end

  # Returns the field at the 1-based position +index+, or nil when there is
  # no such position. Positions below 1 return nil; without the guard,
  # position 0 would wrap round to the last field.
  def with_index(index)
    return nil if index < 1
    self[index - 1]
  end
end

# An Array of fields with field-specific queries.
class Fields < Array
  include GreppableFields

  # Names of all fields, in order.
  def names
    map {|field| field.name }
  end

  def number_of_content_fields
    size
  end

  # Length of the longest field name, or 0 for an empty list. Computed on
  # every call because the list can still grow after the first query.
  def length_of_longest_field_name
    names.map {|name| name.to_s.length }.max || 0
  end

  # Comma-separated constraint descriptions of the fields that have one.
  def constraints_description
    map {|field| field.constraint_description }.reject {|description| description.empty? }.join(", ")
  end

  # Marks every field as normalised.
  def should_be_normalised
    each {|field| field.should_be_normalised }
  end

  # Comma-separated names of the fields marked as normalised.
  def names_of_normalised_fields
    select {|field| field.normalised? }.map {|field| field.name }.join(", ")
  end
end
|
44
|
+
|
45
|
+
# A named field with a replaceable constraint and a "normalised" flag.
class Field
  attr_reader :name
  attr_writer :constraint

  # name       - the field name.
  # constraint - object answering #satisfied_by? and #to_s; defaults to a
  #              NoConstraint.
  def initialize(name, constraint = NoConstraint.new)
    @name = name
    @constraint = constraint
    @normalised = false
  end

  def normalised?
    @normalised
  end

  # Flags the field as normalised.
  def should_be_normalised
    @normalised = true
  end

  # "<name> <constraint>" when the constraint has a textual form, otherwise
  # the empty string.
  def constraint_description
    @constraint.to_s.empty? ? "" : name.to_s + " " + @constraint.to_s
  end

  # Replaces the constraint: a Regexp becomes a RegexpConstraint, anything
  # else a OneOfConstraint.
  def should_be_constrained_to(value)
    @constraint = value.is_a?(Regexp) ? RegexpConstraint.new(value) : OneOfConstraint.new(value)
  end

  # Asks the current constraint whether +value+ is acceptable.
  def valid_input?(value)
    @constraint.satisfied_by?(value)
  end
end
|
83
|
+
|
84
|
+
# A field that additionally carries one readable and writable +value+.
class ConstantField < Field
  attr_reader :value
  attr_writer :value
end
|
87
|
+
|
88
|
+
# A field with a length. Unless another constraint is supplied it is
# constrained by a FixedLengthConstraint of that length.
class FixedLengthField < Field
  attr_reader :length

  # name       - the field name.
  # length     - the field length.
  # constraint - optional constraint; when nil or false a
  #              FixedLengthConstraint for +length+ is used instead.
  def initialize(name, length, constraint = nil)
    chosen_constraint = constraint ? constraint : FixedLengthConstraint.new(length)
    super(name, chosen_constraint)
    @length = length
  end

  # Lets the current constraint append its fragment to +regexp_string+.
  def extend_regexp_string_for_matching(regexp_string)
    @constraint.extend_regexp_string_for_matching(regexp_string)
  end
end
|
100
|
+
|
101
|
+
# Null constraint: accepts every input, adds nothing to a regexp string and
# has an empty textual form.
class NoConstraint
  # Returns +regexp_string+ unchanged.
  def extend_regexp_string_for_matching(regexp_string)
    regexp_string
  end

  # Always true, whatever +string+ is.
  def satisfied_by?(string)
    true
  end

  def to_s
    ""
  end
end
|
112
|
+
|
113
|
+
# Base class for constraints that can express themselves as a regexp
# fragment. Subclasses implement extend_regexp_string_for_matching.
class Constraint
  # Returns the index of the first match of the constraint's regexp in
  # +string+, or nil when there is none. The match is not anchored.
  def satisfied_by?(string)
    string =~ matching_regexp
  end

  protected

  # The regexp is built from the constraint's own fragment only. It is
  # compiled once and reused; nothing in these classes changes the
  # fragment's inputs after construction.
  def matching_regexp
    @matching_regexp ||= Regexp.new(extend_regexp_string_for_matching(""))
  end
end

# Matches exactly +length+ characters, captured as one group.
class FixedLengthConstraint < Constraint
  def initialize(length)
    @length = length
  end

  def extend_regexp_string_for_matching(regexp_string)
    regexp_string + "(.{#{@length}})"
  end

  def to_s; ""; end
end

# Matches any one of a list of literal values, captured as one group.
class OneOfConstraint < Constraint
  # possible_values - the accepted values; nested arrays are flattened.
  def initialize(*possible_values)
    @possible_values = possible_values.flatten
  end

  # Appends "(a|b|...)". Each value is regexp-escaped so that characters
  # such as "." or "+" inside a value are matched literally.
  def extend_regexp_string_for_matching(regexp_string)
    alternatives = @possible_values.map {|value| Regexp.escape(value.to_s) }
    regexp_string + "(#{alternatives.join('|')})"
  end

  def to_s
    if @possible_values.length == 1
      "= #{@possible_values.first}"
    else
      "one of #{@possible_values.join(', ')}"
    end
  end
end

# Matches whatever the supplied Regexp matches.
class RegexpConstraint < Constraint
  def initialize(regexp_that_must_match)
    @regexp_that_must_match = regexp_that_must_match
  end

  # Appends the source of the regexp as is: no capture group is added and
  # the regexp's own options are not carried over.
  def extend_regexp_string_for_matching(regexp_string)
    regexp_string + @regexp_that_must_match.source
  end

  # Matches against the original Regexp, so its options still apply here.
  def satisfied_by?(string)
    string =~ @regexp_that_must_match
  end

  def to_s
    "=~ #{@regexp_that_must_match.inspect}"
  end
end
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|