ascii-data-tools 0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. data/.gitignore +3 -0
  2. data/.rvmrc +1 -0
  3. data/.travis.yml +4 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +40 -0
  6. data/LICENSE.GPL2 +339 -0
  7. data/README.rdoc +52 -0
  8. data/Rakefile +42 -0
  9. data/TODO +4 -0
  10. data/ascii-data-tools.gemspec +30 -0
  11. data/bin/ascii-data-cat +13 -0
  12. data/bin/ascii-data-edit +13 -0
  13. data/bin/ascii-data-norm +13 -0
  14. data/bin/ascii-data-qdiff +13 -0
  15. data/bin/ascii-data-tools-config +9 -0
  16. data/examples/big +10000 -0
  17. data/examples/built_in_records.gz +0 -0
  18. data/examples/slightly_modified_built_in_records.gz +0 -0
  19. data/features/ascii-data-cat.feature +110 -0
  20. data/features/ascii-data-edit.feature +91 -0
  21. data/features/ascii-data-qdiff.feature +54 -0
  22. data/features/encoding_decoding.feature +68 -0
  23. data/features/normaliser.feature +27 -0
  24. data/features/plugins.feature +73 -0
  25. data/features/record_recognition.feature +61 -0
  26. data/features/step_definitions/ascii-data-cat_steps.rb +48 -0
  27. data/features/step_definitions/ascii-data-edit_steps.rb +38 -0
  28. data/features/step_definitions/ascii-data-norm_steps.rb +7 -0
  29. data/features/step_definitions/ascii-data-qdiff_steps.rb +43 -0
  30. data/features/step_definitions/encoding_decoding_steps.rb +23 -0
  31. data/features/step_definitions/plugins_steps.rb +11 -0
  32. data/features/step_definitions/record_recognition_steps.rb +10 -0
  33. data/features/support/env.rb +5 -0
  34. data/lib/ascii-data-tools.rb +8 -0
  35. data/lib/ascii-data-tools/configuration.rb +169 -0
  36. data/lib/ascii-data-tools/configuration_printer.rb +38 -0
  37. data/lib/ascii-data-tools/controller.rb +123 -0
  38. data/lib/ascii-data-tools/discover.rb +19 -0
  39. data/lib/ascii-data-tools/external_programs.rb +23 -0
  40. data/lib/ascii-data-tools/filter.rb +148 -0
  41. data/lib/ascii-data-tools/filter/diffing.rb +139 -0
  42. data/lib/ascii-data-tools/formatting.rb +109 -0
  43. data/lib/ascii-data-tools/global_autodiscovery.rb +21 -0
  44. data/lib/ascii-data-tools/record.rb +50 -0
  45. data/lib/ascii-data-tools/record_type.rb +139 -0
  46. data/lib/ascii-data-tools/record_type/builder.rb +50 -0
  47. data/lib/ascii-data-tools/record_type/decoder.rb +77 -0
  48. data/lib/ascii-data-tools/record_type/encoder.rb +17 -0
  49. data/lib/ascii-data-tools/record_type/field.rb +168 -0
  50. data/lib/ascii-data-tools/record_type/normaliser.rb +38 -0
  51. data/lib/ascii-data-tools/ruby_extensions.rb +7 -0
  52. data/lib/ascii-data-tools/version.rb +3 -0
  53. data/spec/ascii-data-tools/configuration_printer_spec.rb +51 -0
  54. data/spec/ascii-data-tools/configuration_spec.rb +153 -0
  55. data/spec/ascii-data-tools/discover_spec.rb +8 -0
  56. data/spec/ascii-data-tools/filter/diffing_spec.rb +82 -0
  57. data/spec/ascii-data-tools/filter_spec.rb +107 -0
  58. data/spec/ascii-data-tools/formatting_spec.rb +106 -0
  59. data/spec/ascii-data-tools/record_spec.rb +49 -0
  60. data/spec/ascii-data-tools/record_type/builder_spec.rb +69 -0
  61. data/spec/ascii-data-tools/record_type/decoder_spec.rb +73 -0
  62. data/spec/ascii-data-tools/record_type/encoder_spec.rb +32 -0
  63. data/spec/ascii-data-tools/record_type/field_spec.rb +160 -0
  64. data/spec/ascii-data-tools/record_type/normaliser_spec.rb +25 -0
  65. data/spec/ascii-data-tools/record_type_spec.rb +175 -0
  66. data/spec/filter_helper.rb +24 -0
  67. data/spec/record_type_helpers.rb +8 -0
  68. data/spec/spec.opts +2 -0
  69. data/spec/spec_helper.rb +5 -0
  70. metadata +196 -0
@@ -0,0 +1,50 @@
1
+ require 'set'
2
+
3
+ module AsciiDataTools
4
+ module Record
5
# A decoded record: a record type together with the content values that were
# extracted for that type's fields.
class Record
  attr_reader :type

  # type           - the record type; the methods below ask it for +name+,
  #                  +field_names+ and +encode+.
  # content_values - the field values, positionally matching the type's
  #                  field names.
  def initialize(type, content_values)
    @type = type
    @values_by_type = {:content => content_values}
  end

  # Returns the value stored for +requested_field_name+.
  # Raises a RuntimeError when the type has no field of that name.
  def [](requested_field_name)
    requested_key_value_pair = field_names_and_values.detect {|field_name, _value| field_name == requested_field_name }
    raise "Field name '#{requested_field_name}' does not exist!" if requested_key_value_pair.nil?
    requested_key_value_pair.last
  end

  # Name of the record's type.
  def type_name
    @type.name
  end

  # Returns the record as an array of [field name, value] pairs.
  def to_a
    field_names_and_values
  end

  # The content values, in field order.
  def values
    @values_by_type[:content]
  end

  # Asks the type to encode this record's values.
  def encode
    @type.encode(values)
  end

  # Renders "<type name>: <field> => <inspected value>, ...".
  def to_s
    contents = field_names_and_values.map {|field_name, value| "#{field_name} => #{value.inspect}" }.join(", ")
    "#{type_name}: #{contents}"
  end

  # Two records are equal when both their types and their values are equal.
  # Anything that does not expose +type+ and +values+ (nil, strings, ...) is
  # simply unequal instead of raising NoMethodError.
  def ==(other)
    return false unless other.respond_to?(:type) && other.respond_to?(:values)
    type == other.type && values == other.values
  end

  protected

  # Pairs every field name of the type with the value at the same position.
  def field_names_and_values
    @type.field_names.zip(values)
  end
end
49
+ end
50
+ end
@@ -0,0 +1,139 @@
1
+ require 'set'
2
+ require 'forwardable'
3
+ require 'ascii-data-tools/record_type/field'
4
+ require 'ascii-data-tools/record_type/builder'
5
+ require 'ascii-data-tools/record_type/normaliser'
6
+ require 'ascii-data-tools/record_type/decoder'
7
+ require 'ascii-data-tools/record_type/encoder'
8
+
9
+ module AsciiDataTools
10
+ module RecordType
11
# Mixin for fixed-length record types: pulls in the fixed-length decoder and
# encoder plus the Normaliser mixin (all defined in the required files).
module FixedLengthType
  include Decoder::FixedLengthRecordDecoder
  include Encoder::FixedLengthRecordEncoder
  include Normaliser::Normaliser

  # Sum of the lengths of all content fields. The result is computed on the
  # first call and cached in @total_length afterwards.
  def total_length_of_fields
    @total_length ||= fields.map {|field| field.length }.inject(0) {|total, length| total + length }
  end
end
20
+
21
# Mixin for CSV record types: pulls in the CSV decoder and encoder.
module CsvType
  # A multi-argument include mixes the modules in from right to left, so this
  # gives the same ancestor order as including the decoder first and the
  # encoder second.
  include Encoder::CsvRecordEncoder, Decoder::CsvRecordDecoder
end
25
+
26
# A named record type: a list of content fields plus two meta fields
# (:filename and :divider) that are created for every type.
class Type
  extend Forwardable
  attr_reader :name

  def_delegator :fields, :names, :field_names
  def_delegator :fields, :with_index, :field_with_index

  # name           - the type's name.
  # content_fields - the content fields; defaults to an empty Field::Fields.
  def initialize(name, content_fields = Field::Fields.new)
    @name = name
    @fields_by_type = {:content => content_fields, :meta => make_meta_fields}
  end

  # Looks a field up by name among the content fields and the meta fields.
  def field_with_name(name)
    all_fields.with_name(name)
  end

  # Forwards unknown messages to the content fields. Only messages the
  # content fields publicly respond to are forwarded; everything else falls
  # through to the default NoMethodError, so private Kernel methods (format,
  # system, exit, ...) can no longer be reached through a type.
  def method_missing(method_name, *args, &block)
    if content_fields.respond_to?(method_name)
      content_fields.send(method_name, *args, &block)
    else
      super
    end
  end

  # Keeps respond_to? consistent with the forwarding done by method_missing.
  def respond_to_missing?(method_name, include_private = false)
    content_fields.respond_to?(method_name) || super
  end

  # Constrains the :filename meta field to +value+.
  def filename_should_match(value)
    field_with_name(:filename).should_be_constrained_to(value)
  end

  protected

  attr_reader :fields_by_type

  def content_fields
    @fields_by_type[:content]
  end

  alias :fields :content_fields

  # Creates the meta fields every type carries: a plain :filename field and a
  # constant :divider field.
  def make_meta_fields
    Field::Fields.new([Field::Field.new(:filename), Field::ConstantField.new(:divider)])
  end

  # Content fields followed by meta fields, as a fresh Field::Fields.
  def all_fields
    Field::Fields.new(@fields_by_type[:content] + @fields_by_type[:meta])
  end
end
67
+
68
# The type used when nothing else matches: it is named "unknown" and has a
# single content field called "UNKNOWN".
class UnknownType < Type
  include Decoder::UnknownRecordDecoder
  UNKNOWN_RECORD_TYPE_NAME = "unknown"

  def initialize
    catch_all_field = Field::Field.new("UNKNOWN")
    super(UNKNOWN_RECORD_TYPE_NAME, Field::Fields.new([catch_all_field]))
  end
end
76
+
77
# Finds the record type for an encoded record, remembering the types that
# matched before so they are tried first on later lookups.
class TypeDeterminer
  # type_repo - repository holding every known type; defaults to an empty one.
  def initialize(type_repo = RecordTypeRepository.new)
    @all_types = type_repo
    @previously_matched_types = RecordTypeRepository.new
  end

  # Returns the type able to decode +encoded_record+, or a new UnknownType
  # when neither the previously matched types nor the full repository
  # identify one. A found type is added to the previously matched types.
  def determine_type_for(encoded_record)
    recognised = @previously_matched_types.identify_type_for(encoded_record)
    recognised ||= @all_types.identify_type_for(encoded_record)
    return UnknownType.new if recognised.nil?

    @previously_matched_types << recognised
    recognised
  end
end
95
+
96
# A set of record types that can be enumerated, searched by name, asked which
# type decodes a record, and extended through the TypeBuilder DSL.
class RecordTypeRepository
  include Enumerable
  include Builder::TypeBuilder

  # types - initial types; they are stored in a Set.
  def initialize(types = [])
    @types = Set.new(types)
  end

  # Adds +type+ to the underlying set.
  def <<(type)
    @types << type
  end

  # Removes every type.
  def clear
    @types.clear
  end

  # Returns the first type whose name equals +name+, or nil.
  def find_by_name(name)
    find {|candidate| candidate.name == name }
  end

  alias :type :find_by_name

  def each(&block)
    @types.each(&block)
  end

  # Returns the first type that reports it can decode +encoded_record+, or nil.
  def identify_type_for(encoded_record)
    @types.find {|candidate| candidate.able_to_decode?(encoded_record) }
  end

  # Calls the block with every type whose name is matched by +matcher+.
  # A Regexp is matched against the name; a Proc is called with the name.
  # Any other kind of matcher does nothing and returns nil.
  def for_names_matching(matcher, &block)
    name_matches =
      case matcher
      when Regexp then lambda {|name| name =~ matcher }
      when Proc then lambda {|name| matcher[name] }
      end
    return if name_matches.nil?

    select {|candidate| name_matches[candidate.name] }.each {|found_type| block[found_type] }
  end

  # DSL entry point: builds a type from the definition block and stores it.
  def record_type(name, props = {}, &definition)
    self << build_type(name, props, &definition)
  end
end
138
+ end
139
+ end
@@ -0,0 +1,50 @@
1
+ require 'ascii-data-tools/record_type/field'
2
+
3
+ module AsciiDataTools
4
+ module RecordType
5
# Builders that turn the record-type DSL into Field and Type objects.
module Builder
  # Builds a single field from a name and a properties hash.
  module FieldBuilder
    # name       - the field name.
    # properties - :length (passed to FixedLengthField.new),
    #              :constrained_to (applied only when not nil) and
    #              :normalised (marks the field as normalised when truthy).
    #
    # Returns the new Field::FixedLengthField.
    def build_field(name, properties = {})
      field = Field::FixedLengthField.new(name, properties[:length])
      field.should_be_constrained_to(properties[:constrained_to]) unless properties[:constrained_to].nil?
      field.should_be_normalised if properties[:normalised]
      field
    end
  end

  # Builds a whole Type from a name, a properties hash and a definition block.
  module TypeBuilder
    include FieldBuilder

    # type_name  - name of the new type.
    # properties - :family ("csv" or "fixed_length", as a String or Symbol;
    #              nil means fixed length), :applies_for_filenames_matching
    #              (constraint for the :filename meta field) and :divider
    #              (value for the :divider meta field).
    # block      - evaluated with instance_eval, so it can call +field+.
    #
    # Returns the new Type, extended with the module for its family.
    # Raises ArgumentError for an unknown family.
    def build_type(type_name, properties = {}, &block)
      build_fields(&block)
      type = Type.new(type_name, @fields)

      type_family = determine_type_family_from(properties)
      type.extend(type_family)

      type.field_with_name(:filename).should_be_constrained_to(properties[:applies_for_filenames_matching])
      type.field_with_name(:divider).value = properties[:divider]
      type
    end

    # Evaluates the definition block and returns the collected fields.
    # The collection is kept in @fields while the block runs because the
    # +field+ DSL method appends to it.
    def build_fields(&block)
      @fields = Field::Fields.new
      instance_eval(&block) unless block.nil?
      @fields
    end

    protected

    # DSL method used inside a definition block to declare one field.
    def field(name, properties = {})
      @fields << build_field(name, properties)
    end

    # Maps the :family property to the mixin implementing that family.
    # Symbols are accepted as well as strings, and an unknown family is
    # reported by name instead of failing later inside +extend+.
    def determine_type_family_from(properties)
      family = properties[:family]
      return FixedLengthType if family.nil?

      case family.to_s
      when "csv" then CsvType
      when "fixed_length" then FixedLengthType
      else
        raise ArgumentError, "Unknown record type family #{family.inspect} (expected \"csv\" or \"fixed_length\")"
      end
    end
  end
end
49
+ end
50
+ end
@@ -0,0 +1,77 @@
1
+ module AsciiDataTools
2
+ module RecordType
3
# Mixins that let a record type recognise and decode encoded records.
# An encoded record is indexed with :ascii_string (the raw text) and
# :filename (may be nil).
module Decoder
  # Common decoding protocol. The including object must provide
  # +able_to_decode_content?+, +split_into_values+ and +fields_by_type+.
  module RecordDecoder
    # True-ish when the content is decodable and the meta fields are valid.
    def able_to_decode?(encoded_record)
      able_to_decode_content?(encoded_record[:ascii_string]) && meta_fields_valid?(encoded_record)
    end

    # Builds a Record of this type from the values split out of the string.
    def decode(encoded_record)
      Record::Record.new(self, split_into_values(encoded_record[:ascii_string]))
    end

    protected

    def able_to_decode_content?(encoded_string)
      raise "Must be implemented in submodule!"
    end

    def split_into_values(ascii_string)
      raise "Must be implemented in submodule!"
    end

    # A record without a filename is always acceptable; otherwise the
    # filename has to satisfy the :filename meta field.
    def meta_fields_valid?(encoded_record)
      encoded_record[:filename].nil? || filename_field.valid_input?(encoded_record[:filename])
    end

    # The :filename meta field, looked up once and cached.
    def filename_field
      @filename_field ||= fields_by_type[:meta].with_name(:filename)
    end
  end

  # Decoding driven by one regular expression whose capture groups are the
  # field values. The including object must provide +regexp_string+.
  module RegexpBasedDecoder
    include RecordDecoder

    protected

    def able_to_decode_content?(encoded_string)
      encoded_string =~ regexp_for_matching_type
    end

    # Returns the captured groups (everything after the whole-match entry).
    def split_into_values(ascii_string)
      ascii_string.match(regexp_for_matching_type).to_a[1..-1]
    end

    # Compiled once and cached. MULTILINE makes "." match newlines as well.
    def regexp_for_matching_type
      @regexp ||= Regexp.new(regexp_string, Regexp::MULTILINE)
    end

    def regexp_string
      raise "Must be implemented in submodule!"
    end
  end

  # Builds the regexp by letting every content field append its own part
  # between the \A and \z anchors.
  module FixedLengthRecordDecoder
    include RegexpBasedDecoder

    protected

    def regexp_string
      content_fields.inject("\\A") {|regexp_string, field| field.extend_regexp_string_for_matching(regexp_string) } + "\\z"
    end
  end

  # Builds a regexp with one lazy capture group per content field, separated
  # by the type's divider and terminated by a newline.
  module CsvRecordDecoder
    include RegexpBasedDecoder

    protected

    def regexp_string
      # The divider is literal text, so it has to be escaped: an unescaped
      # "|" would turn the pattern into an alternation and "." would match
      # any character.
      divider = Regexp.escape(field_with_name(:divider).value)
      "\\A" + "(.*?)" + ((divider + "(.*?)") * (content_fields.length - 1)) + "\n\\z"
    end
  end

  # Decoding for unrecognised input: the whole string becomes the only value.
  module UnknownRecordDecoder
    def decode(encoded_record)
      Record::Record.new(self, [encoded_record[:ascii_string]])
    end
  end
end
76
+ end
77
+ end
@@ -0,0 +1,17 @@
1
+ module AsciiDataTools
2
+ module RecordType
3
+ module Encoder
4
# Encoding for fixed-length records.
module FixedLengthRecordEncoder
  # Returns the values concatenated into one string, with nothing added
  # between them.
  def encode(values)
    values.join
  end
end
9
+
10
# Encoding for CSV records. The including object must provide
# +field_with_name+.
module CsvRecordEncoder
  # Returns the values joined by the value of the :divider field, followed
  # by a newline.
  def encode(values)
    divider = field_with_name(:divider).value
    values.join(divider) + "\n"
  end
end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,168 @@
1
+ module AsciiDataTools
2
+ module RecordType
3
# Field definitions and the constraints that restrict their values.
module Field
  # Lookup helpers for a collection of fields. The including class must
  # provide +detect+, +select+ and +[]+.
  module GreppableFields
    # Returns the first field whose name equals +field_name+, or nil.
    def with_name(field_name)
      detect {|field| field.name == field_name }
    end

    # Returns a new Fields holding the fields the block selects.
    def fields_with(&block)
      Fields.new(select(&block))
    end

    # Returns the field at the 1-based position +index+.
    def with_index(index)
      self[index - 1]
    end
  end

  # An Array of fields with a few collection-wide queries.
  class Fields < Array
    include GreppableFields

    # Names of all fields, in order.
    def names
      map {|field| field.name }
    end

    def number_of_content_fields
      size
    end

    # Length of the longest field name, or 0 for an empty collection (this
    # used to raise NoMethodError on nil). Names are converted with to_s so
    # symbol names are measured the same way as strings.
    # The result is cached on first use, so it does not follow fields that
    # are added afterwards.
    def length_of_longest_field_name
      @length_of_longest_field_name ||= names.map {|name| name.to_s.length }.max || 0
    end

    # Comma-separated constraint descriptions of the fields that have one.
    def constraints_description
      map {|field| field.constraint_description }.reject {|description| description.empty? }.join(", ")
    end

    # Marks every field as normalised.
    def should_be_normalised
      each {|field| field.should_be_normalised }
    end

    # Comma-separated names of the normalised fields.
    def names_of_normalised_fields
      select {|field| field.normalised? }.map {|field| field.name }.join(", ")
    end
  end

  # A named field with a constraint and a normalisation flag.
  class Field
    attr_reader :name
    attr_writer :constraint

    # name       - the field name.
    # constraint - defaults to NoConstraint, which accepts every input.
    def initialize(name, constraint = NoConstraint.new)
      @name = name
      @constraint = constraint
      @normalised = false
    end

    def normalised?
      @normalised
    end

    def should_be_normalised
      @normalised = true
    end

    # Returns "<name> <constraint>", or "" when the constraint renders as an
    # empty string.
    def constraint_description
      description = @constraint.to_s
      description.empty? ? "" : name.to_s + " " + description
    end

    # Replaces the current constraint: a Regexp becomes a RegexpConstraint,
    # anything else a OneOfConstraint.
    def should_be_constrained_to(value)
      @constraint = value.is_a?(Regexp) ? RegexpConstraint.new(value) : OneOfConstraint.new(value)
    end

    # True-ish when +value+ satisfies the current constraint.
    def valid_input?(value)
      @constraint.satisfied_by?(value)
    end
  end

  # A field that additionally carries a settable value.
  class ConstantField < Field
    attr_accessor :value
  end

  # A field with a length; unless another constraint is given it is
  # constrained to exactly that many characters.
  class FixedLengthField < Field
    attr_reader :length

    def initialize(name, length, constraint = nil)
      super(name, constraint || FixedLengthConstraint.new(length))
      @length = length
    end

    # Appends this field's part to the regexp string being built for a type.
    def extend_regexp_string_for_matching(regexp_string)
      @constraint.extend_regexp_string_for_matching(regexp_string)
    end
  end

  # The constraint that restricts nothing.
  class NoConstraint
    def extend_regexp_string_for_matching(regexp_string)
      regexp_string
    end

    def satisfied_by?(string)
      true
    end

    def to_s; ""; end
  end

  # Base class for regexp-backed constraints. Subclasses implement
  # +extend_regexp_string_for_matching+.
  class Constraint
    # Matches +string+ against this constraint's own regexp part. The
    # pattern is not anchored, so a match anywhere in the string counts.
    # The regexp is compiled once and cached; the subclasses in this file
    # only assign their state in initialize, so the cache cannot go stale.
    def satisfied_by?(string)
      @regexp_for_validation ||= Regexp.new(extend_regexp_string_for_matching(""))
      string =~ @regexp_for_validation
    end
  end

  # Captures exactly +length+ characters.
  class FixedLengthConstraint < Constraint
    def initialize(length)
      @length = length
    end

    def extend_regexp_string_for_matching(regexp_string)
      regexp_string + "(.{#{@length}})"
    end

    def to_s; ""; end
  end

  # Accepts one of a list of literal values.
  class OneOfConstraint < Constraint
    # possible_values - one or more values; nested arrays are flattened.
    def initialize(*possible_values)
      @possible_values = possible_values.flatten
    end

    # Appends a capture group with one alternative per value. The values are
    # literals, so they are escaped: "a.gz" no longer matches "a-gz" and a
    # value containing "|" or "(" no longer changes the pattern's structure.
    def extend_regexp_string_for_matching(regexp_string)
      alternatives = @possible_values.map {|value| Regexp.escape(value.to_s) }
      regexp_string + "(#{alternatives.join('|')})"
    end

    def to_s
      if @possible_values.length == 1
        "= #{@possible_values.first}"
      else
        "one of #{@possible_values.join(', ')}"
      end
    end
  end

  # Accepts whatever a caller-supplied Regexp matches.
  class RegexpConstraint < Constraint
    def initialize(regexp_that_must_match)
      @regexp_that_must_match = regexp_that_must_match
    end

    # Appends the regexp's source text as-is; no capture group is added and
    # the regexp's options are not carried over.
    def extend_regexp_string_for_matching(regexp_string)
      regexp_string + @regexp_that_must_match.source
    end

    def satisfied_by?(string)
      string =~ @regexp_that_must_match
    end

    def to_s
      "=~ #{@regexp_that_must_match.inspect}"
    end
  end
end
167
+ end
168
+ end