structured_csv 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/exe/csv_join ADDED
@@ -0,0 +1,27 @@
1
#!/usr/bin/env ruby
# Joins every CSV file found under a directory into a single CSV file that is
# written next to that directory (the directory path with a ".csv" extension).
#
# Usage: <this script> DIRECTORY

# resolve bin path, ignoring symlinks
require "pathname"
bin_file = Pathname.new(__FILE__).realpath

# add self to libpath
$:.unshift File.expand_path("../../lib", bin_file)

# Fixes https://github.com/rubygems/rubygems/issues/1420
require "rubygems/specification"

class Gem::Specification
  def this
    self
  end
end

# Load the whole library: CsvJoin.convert calls StructuredCsv::Common, which
# "structured_csv/csv_join" does not require by itself.
require "structured_csv"

csvdir = ARGV.pop
if csvdir.nil? || csvdir.empty?
  abort "usage: #{File.basename($PROGRAM_NAME)} DIRECTORY"
end

# cleanpath drops a trailing slash, so "dir/" maps to "dir.csv" rather than
# to a hidden "dir/.csv" inside the directory being joined.
outfile = Pathname.new(csvdir).cleanpath.sub_ext(".csv").to_s

StructuredCsv::CsvJoin.convert(csvdir, outfile)
data/exe/csv_join.rb ADDED
@@ -0,0 +1 @@
1
+ exe/csv_join
@@ -0,0 +1,30 @@
1
#!/usr/bin/env ruby
# Converts a structured CSV file into a YAML file written alongside it
# (same path, with the trailing "csv" replaced by "yaml").
#
# Usage: <this script> FILE.csv

# resolve bin path, ignoring symlinks
require "pathname"
bin_file = Pathname.new(__FILE__).realpath

# add self to libpath
$:.unshift File.expand_path("../../lib", bin_file)

# Fixes https://github.com/rubygems/rubygems/issues/1420
require "rubygems/specification"

class Gem::Specification
  def this
    self
  end
end

# Load the whole library so that StructuredCsv::Common, which
# Csv2Yaml.convert relies on, is defined as well.
require "structured_csv"

csvfile = ARGV.pop
# A missing argument (nil) is reported with the same message as a wrong one.
raise "first argument must be a .csv file!" unless csvfile&.end_with?(".csv")

# Only the final extension is rewritten; \z anchors at the end of the string.
outfile = csvfile.sub(/csv\z/, "yaml")

# The converter module is named Csv2Yaml.
File.write(outfile, StructuredCsv::Csv2Yaml.convert(csvfile).to_yaml)
@@ -0,0 +1 @@
1
+ exe/structured_csv_to_yaml
@@ -1,8 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "structured_csv/version"
4
-
5
- module StructuredCsv
6
- class Error < StandardError; end
7
- # Your code goes here...
8
- end
4
+ require_relative "structured_csv/common"
5
+ require_relative "structured_csv/csv2yaml"
6
+ require_relative "structured_csv/csv_join"
@@ -0,0 +1,12 @@
1
+ require "csv"
2
+
3
module StructuredCsv
  # File-loading helper shared by the converters.
  module Common
    class << self
      # Reads +csvfile+ and parses it as CSV.
      #
      # The file is read with the "bom|utf-8" encoding, invalid byte sequences
      # are scrubbed, and the text is parsed with liberal parsing enabled.
      #
      # @param csvfile [String] path of the CSV file to read
      # @return [Array<Array>] the parsed rows
      def load_csv(csvfile)
        text = File.read(csvfile, encoding: "bom|utf-8")
        CSV.parse(text.scrub, liberal_parsing: true, encoding: "UTF-8")
      end
    end
  end
end
@@ -0,0 +1,239 @@
1
+ require "csv"
2
+ require "yaml"
3
+
4
module StructuredCsv
  # Turns the rows of a "structured CSV" into nested Ruby hashes and arrays.
  #
  # A sheet is split into named sections: a section starts on the row whose
  # first cell equals the section name and ends at the first blank row that
  # follows. {convert} reads the "METADATA" and "DATA" sections.
  module Csv2Yaml
    # Cell type used when a header carries no "[type]" suffix.
    CAST_DEFAULT_TYPE = "string".freeze

    # Locates the section called +section_name+ inside +csv+.
    #
    # The second cell of the section's start row may carry options written as
    # "key=value;key=value"; they are returned under :meta with symbol keys.
    #
    # @param csv [Array<Array>] all rows of the sheet
    # @param section_name [#to_s] name found in the first cell of the start row
    # @return [Hash] :first_row (index after the start row, nil when the
    #   section was not found), :last_row (index of the terminating blank row,
    #   -1 when there is none), :rows (csv[first_row..last_row], starting at 0
    #   when the section was not found) and :meta (the parsed options)
    def self.get_portion(csv, section_name)
      first_row = nil
      last_row = -1
      data_meta = {}

      warn "section_name #{section_name}"

      csv.each_with_index do |row, index|
        if first_row.nil?
          next unless is_start_of_portion?(row, section_name)

          data_meta = parse_section_options(row[1])
          first_row = index + 1
          next
        end

        next unless is_row_empty?(row)

        last_row = index
        break
      end

      {
        first_row: first_row,
        last_row: last_row,
        rows: csv[(first_row || 0)..last_row],
        meta: data_meta
      }
    end

    # @param row [Array] a CSV row
    # @param section_name [#to_s]
    # @return [Boolean] true when the row's first cell, stripped, equals
    #   +section_name+
    def self.is_start_of_portion?(row, section_name)
      return false if row.first.nil?

      row.first.strip.to_s == section_name.to_s
    end

    # @param row [Array] a CSV row
    # @return [Boolean] true when every cell is nil or a string that is empty
    #   once stripped (so whitespace-only rows count as blank)
    def self.is_row_empty?(row)
      row.all? { |f| f.nil? || (f.is_a?(String) && f.strip.empty?) }
    end

    # Splits a header cell of the form "name[type]" into its parts.
    #
    # @param header_field [String]
    # @return [Hash] :name and :type; :type is CAST_DEFAULT_TYPE when the
    #   header has no "[type]" suffix
    def self.split_header_key_type(header_field)
      parts = header_field.match(/\A([^\[]*)\[(.*)\]\Z/)
      return { name: header_field, type: CAST_DEFAULT_TYPE } if parts.nil?

      { name: parts[1], type: parts[2] }
    end

    # Casts a cell value according to a type name (case-insensitive).
    #
    # Supported types: "boolean" ("true"/"false", anything else gives nil),
    # "integer", "string" and "array{element_type}" (elements separated by
    # ";"). Unknown types return value.to_s.
    #
    # @param value [String, nil]
    # @param type_in_string [String]
    # @return [Object, nil] nil when +value+ is nil
    def self.cast_type(value, type_in_string)
      return if value.nil?

      case type_in_string.downcase
      when "boolean"
        # Strip like the integer/string branches do, so " true " is accepted.
        case value.to_s.strip
        when "true" then true
        when "false" then false
        end
      when "integer"
        value.to_s.strip.to_i
      when "string"
        value.to_s.strip
      when /\Aarray\{(.*)\}/
        # Capture before recursing; "array{}" falls back to the default type.
        element_type = Regexp.last_match(1)
        element_type = CAST_DEFAULT_TYPE if element_type.empty?
        value.split(";").map { |element| cast_type(element, element_type) }
      else
        value.to_s
      end
    end

    # Builds the metadata hash from "key[type], value" rows.
    #
    # Blank rows and rows without a key in the first cell are skipped.
    #
    # @param rows [Array<Array>]
    # @return [Hash] metadata with dotted keys expanded into nested hashes
    def self.parse_metadata(rows)
      hash = {}

      rows.each do |row|
        next if is_row_empty?(row)
        # A value without a key cannot be stored; skipping avoids nil.match.
        next if row.first.nil?

        name_type = split_header_key_type(row.first)
        hash[name_type[:name]] = cast_type(row[1], name_type[:type])
      end

      normalize_namespaces(hash)
    end

    # Builds the data structure from a header row followed by record rows.
    #
    # Header cells are "name" or "name[type]"; nil header cells are ignored
    # together with their column. Blank rows and rows with a nil first cell
    # are skipped. Cells that cast to nil are left out of the record.
    #
    # @param rows [Array<Array>] header row first, then records
    # @param data_meta [Hash] section options: :type is "hash" (default,
    #   records keyed by their first declared column) or "array"; :name, when
    #   present, wraps the result as { name => result }
    # @return [Hash, Array, nil] nil (or { name => nil }) for any other :type
    def self.parse_data(rows, data_meta)
      data_name = data_meta[:name]
      data_type = data_meta[:type] || "hash"

      base_structure = case data_type
                       when "hash" then {}
                       when "array" then []
                       end

      columns = []
      column_names = []

      rows.each_with_index do |row, index|
        if index.zero?
          columns = header_columns(row)
          column_names = columns.map { |column| column[:name] }
          next
        end

        # Skip all the empty rows
        next if is_row_empty?(row)

        # Skip if no key value
        next if row[0].nil?

        # Read each value from the column its header actually sits in.
        values = columns.map do |column|
          cast_type(row[column[:index]]&.strip, column[:type])
        end

        key = values.first
        record = column_names.zip(values).to_h.compact

        case data_type
        when "hash"
          unless base_structure[key].nil?
            warn "[WARNING] there is already data inside key [#{key}] -- maybe you should set type=array?"
          end
          base_structure[key] = normalize_namespaces(record)
        when "array"
          base_structure << normalize_namespaces(record)
        end
      end

      base_structure = { data_name => base_structure } if data_name

      base_structure
    end

    # Loads +csv_filename+ and converts its METADATA and DATA sections.
    #
    # @param csv_filename [String]
    # @return [Hash] { "metadata" => ..., "data" => ... }
    def self.convert(csv_filename)
      raw_data = StructuredCsv::Common.load_csv(csv_filename)

      metadata_section = get_portion(raw_data, "METADATA")
      data_section = get_portion(raw_data, "DATA")

      {
        "metadata" => parse_metadata(metadata_section[:rows]),
        "data" => parse_data(data_section[:rows], data_section[:meta])
      }
    end

    # Structure all child hashes if the key is namespaced.
    # e.g. { "hello.me" => data } becomes
    # { "hello" => { "me" => data } }
    #
    # When a plain key and a namespaced key share a prefix (e.g. "a" and
    # "a.b"), whichever comes last wins.
    #
    # @param hash [Hash]
    # @return [Hash] a new hash with string keys
    def self.normalize_namespaces(hash)
      new_hash = {}

      hash.each_pair do |k, v|
        key_components = k.to_s.split(".")
        last_component = key_components.pop

        level = key_components.inject(new_hash) do |current, component|
          # Replace a scalar stored earlier under the same prefix instead of
          # trying to index into it.
          current[component] = {} unless current[component].is_a?(Hash)
          current[component]
        end

        level[last_component] = v
      end

      new_hash
    end

    # Parses "key=value;key=value" section options into a symbol-keyed hash.
    # Keys and values are stripped; segments without a key are ignored.
    def self.parse_section_options(options)
      options.to_s.split(";").each_with_object({}) do |option, meta|
        key, value = option.split("=", 2).map(&:strip)
        next if key.nil? || key.empty?

        meta[key.to_sym] = value
      end
    end

    # Describes the non-nil header cells as { name:, type:, index: }, where
    # :index is the cell's position in the original row.
    def self.header_columns(header_row)
      header_row.each_with_index.map do |field, index|
        split_header_key_type(field).merge(index: index) unless field.nil?
      end.compact
    end

    private_class_method :parse_section_options, :header_columns
  end
end
@@ -0,0 +1,78 @@
1
+ require "csv"
2
+ require "yaml"
3
+ require "pathname"
4
+
5
module StructuredCsv
  # Joins many CSV files that share one header layout into a single CSV file.
  module CsvJoin
    # Extracts the rows of the section called +section_name+ from +csv+.
    #
    # The section starts after the row recognised by
    # Csv2Yaml.is_start_of_portion? and runs up to and including the first
    # row that Csv2Yaml.is_row_empty? reports as blank (or to the end when
    # there is none). When the section is not found, all rows are returned.
    #
    # @param csv [Array<Array>] all rows of the sheet
    # @param section_name [#to_s]
    # @return [Array<Array>]
    def self.join(csv, section_name)
      first_row = nil
      last_row = -1

      warn "section_name #{section_name}"

      csv.each_with_index do |row, index|
        if first_row.nil?
          next unless Csv2Yaml.is_start_of_portion?(row, section_name)

          warn "found first"
          first_row = index + 1
          next
        end

        next unless Csv2Yaml.is_row_empty?(row)

        warn "found last"
        last_row = index
        break
      end

      warn "first #{first_row} last #{last_row}"
      # A nil range start is an error on Ruby 2.6; start from 0 instead.
      csv[(first_row || 0)..last_row]
    end

    # Writes every row of every ".csv" file under +csvdir+ (searched
    # recursively, in sorted path order) to +outfile+.
    #
    # The output header is "name" followed by the header of the first
    # non-empty input file; all files are assumed to use that same header.
    # Each data row is prefixed with the base name (without extension) of the
    # file it came from. String cells are stripped and rows whose cells are
    # all nil or empty are dropped. Input files with no rows are skipped, and
    # +outfile+ itself is never treated as an input.
    #
    # @param csvdir [String] directory containing the CSV files
    # @param outfile [String] path of the CSV file to write
    # @return [nil]
    # @raise [RuntimeError] when +csvdir+ is not a directory or holds no
    #   ".csv" files; nothing is written in either case
    def self.convert(csvdir, outfile)
      raise "first argument must be a directory!" unless File.directory?(csvdir)

      target = File.expand_path(outfile)
      csvfiles = Dir.glob(File.join(csvdir, "**", "*.csv")).sort
      csvfiles.reject! { |path| File.expand_path(path) == target }
      raise "directory must contain .csv files!" if csvfiles.empty?

      # The block form closes the output even when reading an input fails.
      CSV.open(outfile, "wb", encoding: "UTF-8") do |csv|
        header_written = false

        csvfiles.each do |csvfile|
          content = StructuredCsv::Common.load_csv(csvfile)

          csvheader = content.shift
          next if csvheader.nil? # file without any rows

          # Assume all files use the same header structure as the first one
          unless header_written
            csv << ["name"] + csvheader
            header_written = true
          end

          basename = Pathname.new(csvfile).basename.sub_ext("").to_s
          content.each do |filerow|
            row = filerow.map { |value| value.is_a?(String) ? value.strip : value }
            next if row.all? { |f| f.nil? || f.empty? }

            csv << [basename, *row]
          end
        end
      end

      nil
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module StructuredCsv
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.1"
5
5
  end
@@ -8,12 +8,12 @@ Gem::Specification.new do |spec|
8
8
  spec.authors = ["Ribose Inc."]
9
9
  spec.email = ["open.source@ribose.com"]
10
10
 
11
- spec.summary = "Library to process structured CSV files"
12
- spec.description = "Library to process structured CSV files"
13
- spec.homepage = "https://open.ribose.com"
14
- spec.required_ruby_version = Gem::Requirement.new(">= 2.4.0")
11
+ spec.summary = "Library to process structured CSV files"
12
+ spec.description = "Library to process structured CSV files"
13
+ spec.homepage = "https://open.ribose.com"
14
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.6.7")
15
15
 
16
- spec.metadata["homepage_uri"] = spec.homepage
16
+ spec.metadata["homepage_uri"] = spec.homepage
17
17
  spec.metadata["source_code_uri"] = "https://github.com/riboseinc/structured_csv"
18
18
  # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
19
19
 
@@ -31,4 +31,15 @@ Gem::Specification.new do |spec|
31
31
 
32
32
  # For more information and examples about making a new gem, checkout our
33
33
  # guide at: https://bundler.io/guides/creating_gem.html
34
+
35
+ spec.add_dependency "csv", "~> 3.1"
36
+ # spec.add_dependency "pathname", "~> 0.1"
37
+ spec.add_dependency "yaml", "~> 0.1"
38
+
39
+ spec.add_development_dependency "byebug", "~> 11.1"
40
+ spec.add_development_dependency "guard", "~> 2.17"
41
+ spec.add_development_dependency "guard-rspec", "~> 4.7"
42
+ spec.add_development_dependency "rake", "~> 13.0"
43
+ spec.add_development_dependency "rspec", "~> 3.10"
44
+ spec.add_development_dependency "simplecov", "~> 0.21"
34
45
  end