structured_csv 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/exe/csv_join ADDED
@@ -0,0 +1,27 @@
1
#!/usr/bin/env ruby
# Joins every .csv file found under a directory into one CSV file.
#
# Usage: csv_join DIRECTORY
#
# The result is written next to the directory, as "<DIRECTORY>.csv".

# resolve bin path, ignoring symlinks
require "pathname"
bin_file = Pathname.new(__FILE__).realpath

# add self to libpath
$:.unshift File.expand_path("../../lib", bin_file)

# Fixes https://github.com/rubygems/rubygems/issues/1420
require "rubygems/specification"

class Gem::Specification
  def this
    self
  end
end

# Load the whole library: CsvJoin.convert calls StructuredCsv::Common, which
# "structured_csv/csv_join" on its own does not require.
require "structured_csv"

csvdir = ARGV.pop
abort "Usage: #{File.basename($PROGRAM_NAME)} DIRECTORY" if csvdir.nil?

# cleanpath drops a trailing "/" (as added by shell completion); without it
# sub_ext would turn "dir/" into "dir/.csv", a hidden file inside the input
# directory, instead of the sibling file "dir.csv".
outfile = Pathname.new(csvdir).cleanpath.sub_ext(".csv").to_s

StructuredCsv::CsvJoin.convert(csvdir, outfile)
data/exe/csv_join.rb ADDED
@@ -0,0 +1 @@
1
+ exe/csv_join
@@ -0,0 +1,30 @@
1
#!/usr/bin/env ruby
# Converts a structured CSV file into YAML.
#
# Usage: structured_csv_to_yaml FILE.csv
#
# The result is written next to the input, with the "csv" extension replaced
# by "yaml".

# resolve bin path, ignoring symlinks
require "pathname"
bin_file = Pathname.new(__FILE__).realpath

# add self to libpath
$:.unshift File.expand_path("../../lib", bin_file)

# Fixes https://github.com/rubygems/rubygems/issues/1420
require "rubygems/specification"

class Gem::Specification
  def this
    self
  end
end

# Load the whole library: Csv2Yaml.convert calls StructuredCsv::Common, which
# "structured_csv/csv2yaml" on its own does not require.
require "structured_csv"

csvfile = ARGV.pop
# \z anchors at the very end of the string ($ would also match before a newline).
raise "first argument must be a .csv file!" unless /\.csv\z/.match?(csvfile)

outfile = csvfile.sub(/csv\z/, "yaml")

# The converter module is named Csv2Yaml.
File.write(
  outfile,
  StructuredCsv::Csv2Yaml.convert(csvfile).to_yaml,
)
@@ -0,0 +1 @@
1
+ exe/structured_csv_to_yaml
@@ -1,8 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "structured_csv/version"
4
-
5
- module StructuredCsv
6
- class Error < StandardError; end
7
- # Your code goes here...
8
- end
4
+ require_relative "structured_csv/common"
5
+ require_relative "structured_csv/csv2yaml"
6
+ require_relative "structured_csv/csv_join"
@@ -0,0 +1,12 @@
1
+ require "csv"
2
+
3
module StructuredCsv
  # File-loading helpers shared by the converters.
  module Common
    # Reads +csvfile+ and parses it into an array of rows.
    #
    # The file is read as UTF-8 with a leading byte-order mark skipped, invalid
    # byte sequences are scrubbed, and the text is parsed with liberal parsing
    # enabled.
    #
    # @param csvfile [String] path of the CSV file to read
    # @return [Array<Array>] the parsed rows
    def self.load_csv(csvfile)
      raw_text = File.read(csvfile, encoding: "bom|utf-8")
      clean_text = raw_text.scrub

      CSV.parse(clean_text, liberal_parsing: true, encoding: "UTF-8")
    end
  end
end
@@ -0,0 +1,239 @@
1
+ require "csv"
2
+ require "yaml"
3
+
4
module StructuredCsv
  # Turns the rows of a "structured" CSV file into plain Ruby data.
  #
  # A structured CSV holds named sections. A section starts on a row whose first
  # cell is the section name (e.g. "METADATA" or "DATA") and ends at the next
  # empty row, or at the end of the file. The second cell of the section row may
  # carry ";"-separated "key=value" options.
  module Csv2Yaml
    # Type used for a header cell that carries no "[type]" suffix.
    CAST_DEFAULT_TYPE = "string".freeze

    # Locates a named section inside the parsed CSV rows.
    #
    # When the section name is never found, +first_row+ is nil and +rows+
    # contains every row of +csv+.
    #
    # @param csv [Array<Array>] all rows of the file
    # @param section_name [String] name to look for in the first cell
    # @return [Hash] with +:first_row+ (index of the row after the section row,
    #   or nil), +:last_row+ (index of the terminating empty row, or -1),
    #   +:rows+ (the rows from first_row through last_row) and +:meta+ (the
    #   options parsed from the section row, keyed by symbol)
    def self.get_portion(csv, section_name)
      first_row = nil
      last_row = -1
      data_meta = {}

      warn "section_name #{section_name}"

      csv.each_with_index do |row, index|
        if first_row.nil?
          next unless is_start_of_portion?(row, section_name)

          data_meta = parse_section_options(row[1])
          first_row = index + 1
        elsif is_row_empty?(row)
          # The first empty row after the section start terminates the section.
          last_row = index
          break
        end
      end

      {
        first_row: first_row,
        last_row: last_row,
        rows: csv[(first_row || 0)..last_row],
        meta: data_meta
      }
    end

    # Parses the option cell of a section row ("type=array;name=items").
    #
    # Segments and their parts are stripped, blank segments are ignored, and
    # only the first "=" splits key from value so values may contain "=".
    # A segment without "=" maps its key to nil.
    #
    # @param raw [String, nil] content of the option cell
    # @return [Hash{Symbol => String, nil}]
    def self.parse_section_options(raw)
      return {} if raw.nil?

      raw.split(";").each_with_object({}) do |segment, options|
        key, value = segment.split("=", 2).map(&:strip)
        next if key.nil? || key.empty?

        options[key.to_sym] = value
      end
    end
    private_class_method :parse_section_options

    # @param row [Array] one CSV row
    # @param section_name [String]
    # @return [Boolean] whether the stripped first cell equals +section_name+
    def self.is_start_of_portion?(row, section_name)
      return false if row.first.nil?

      row.first.strip.to_s == section_name.to_s
    end

    # A row is empty when every cell is nil or a string holding nothing but
    # whitespace (CSV parsing yields "" for a quoted empty cell and keeps
    # whitespace-only cells as strings).
    #
    # @param row [Array] one CSV row
    # @return [Boolean]
    def self.is_row_empty?(row)
      row.all? do |field|
        field.nil? || (field.is_a?(String) && field.strip.empty?)
      end
    end

    # Splits a header cell of the form "name[type]" into its parts.
    #
    # @param header_field [String] e.g. "age[integer]" or "title"
    # @return [Hash] +{ name:, type: }+; type falls back to CAST_DEFAULT_TYPE
    #   when the cell has no "[type]" suffix
    def self.split_header_key_type(header_field)
      match = header_field.match(/\A([^\[]*)\[(.*)\]\Z/)
      return { name: header_field, type: CAST_DEFAULT_TYPE } if match.nil?

      { name: match[1], type: match[2] }
    end

    # Casts a raw cell value according to a type name (case-insensitive).
    #
    # Supported types: "boolean" ("true"/"false" in any case, surrounding
    # whitespace ignored; anything else gives nil), "integer", "string" and
    # "array{TYPE}" (";"-separated elements, each cast as TYPE; an empty TYPE
    # means CAST_DEFAULT_TYPE). Any other type returns +value.to_s+ unchanged.
    #
    # @param value [String, nil] raw cell content
    # @param type_in_string [String] type name
    # @return [Object, nil] the cast value; nil when +value+ is nil
    def self.cast_type(value, type_in_string)
      return if value.nil?

      case type_in_string.downcase
      when "boolean"
        case value.to_s.strip.downcase
        when "true" then true
        when "false" then false
        end
      when "integer"
        value.to_s.strip.to_i
      when "string"
        value.to_s.strip
      when /^array\{(.*)\}/
        element_type = Regexp.last_match(1)
        # The capture is "" (never nil) for "array{}", so test for emptiness.
        element_type = CAST_DEFAULT_TYPE if element_type.empty?

        value.split(";").map { |element| cast_type(element, element_type) }
      else
        value.to_s
      end
    end

    # Builds the metadata hash from "key[type],value" rows.
    #
    # Empty rows and rows without a key in the first cell are skipped.
    #
    # @param rows [Array<Array>] rows of the metadata section
    # @return [Hash] metadata with dotted keys expanded into nested hashes
    def self.parse_metadata(rows)
      hash = {}

      rows.each do |row|
        next if is_row_empty?(row)
        next if row.first.nil?

        name_type = split_header_key_type(row.first)
        hash[name_type[:name]] = cast_type(row[1], name_type[:type])
      end

      normalize_namespaces(hash)
    end

    # Builds the data structure from a header row followed by value rows.
    #
    # The first row is the header; every non-blank header cell names a column
    # (optionally typed as "name[type]"). Each following row becomes a hash of
    # column name => cast value, with nil values left out.
    #
    # +data_meta[:type]+ selects the container: "hash" (default) keys each row
    # by the value of its first header column, "array" appends rows in order.
    # Any other type yields no container (nil). +data_meta[:name]+, when
    # given, wraps the result as +{ name => result }+.
    # NOTE(review): a +:key+ option is accepted on the section row but is not
    # used here; the first header column is always the key.
    #
    # @param rows [Array<Array>] rows of the data section, header first
    # @param data_meta [Hash{Symbol => String}] options from the section row
    # @return [Hash, Array, nil]
    def self.parse_data(rows, data_meta)
      data_name = data_meta[:name]
      data_type = data_meta[:type] || "hash"

      base_structure = case data_type
                       when "hash" then {}
                       when "array" then []
                       end

      header = []

      rows.each_with_index do |row, index|
        if index.zero?
          # Remember each header's original column so that blank header cells
          # do not shift the values of the columns that follow them.
          header = row.each_with_index.map do |field, column|
            split_header_key_type(field).merge(column: column) unless field.nil?
          end.compact
          next
        end

        # Skip all the empty rows
        next if is_row_empty?(row)

        # Skip if no key value
        next if row[0].nil?

        values = header.map do |column_def|
          cell = row[column_def[:column]]
          cast_type(cell&.strip, column_def[:type])
        end

        key = values.first
        names = header.map { |column_def| column_def[:name] }
        # Remove keys if they point to nil
        record = names.zip(values).to_h.compact

        case data_type
        when "hash"
          unless base_structure[key].nil?
            warn "[WARNING] there is already data inside key [#{key}] -- maybe you should set type=array?"
          end
          base_structure[key] = normalize_namespaces(record)
        when "array"
          base_structure << normalize_namespaces(record)
        end
      end

      base_structure = { data_name => base_structure } if data_name

      base_structure
    end

    # Loads a structured CSV file and converts its METADATA and DATA sections.
    #
    # @param csv_filename [String] path of the CSV file
    # @return [Hash] +{ "metadata" => ..., "data" => ... }+
    def self.convert(csv_filename)
      raw_data = StructuredCsv::Common.load_csv(csv_filename)

      metadata_section = get_portion(raw_data, "METADATA")
      data_section = get_portion(raw_data, "DATA")

      {
        "metadata" => parse_metadata(metadata_section[:rows]),
        "data" => parse_data(data_section[:rows], data_section[:meta])
      }
    end

    # Structure all child hashes if the key is namespaced.
    # e.g. { "hello.me" => data } becomes
    # { "hello" => { "me" => data } }
    #
    # @param hash [Hash] hash whose keys may contain "." separators
    # @return [Hash] a new hash with string keys, nested along the separators
    def self.normalize_namespaces(hash)
      hash.each_pair.with_object({}) do |(key, value), nested|
        *parents, leaf = key.to_s.split(".")

        # Walk (and create on demand) one hash level per leading component.
        level = parents.reduce(nested) do |current, component|
          current[component] ||= {}
        end

        level[leaf] = value
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require "csv"
2
+ require "yaml"
3
+ require "pathname"
4
+
5
module StructuredCsv
  # Concatenates the CSV files of a directory into a single CSV file.
  module CsvJoin
    # Extracts the rows of a named section from parsed CSV rows.
    #
    # The section starts after the row whose first cell is +section_name+ and
    # runs through the next empty row (or to the end). When the section name is
    # never found, all rows are returned, matching Csv2Yaml.get_portion.
    #
    # @param csv [Array<Array>] all rows
    # @param section_name [String] name to look for in the first cell
    # @return [Array<Array>] the rows of the section
    def self.join(csv, section_name)
      first_row = nil
      last_row = -1

      warn "section_name #{section_name}"

      csv.each_with_index do |row, index|
        if first_row.nil?
          next unless Csv2Yaml.is_start_of_portion?(row, section_name)

          warn "found first"
          first_row = index + 1
        elsif Csv2Yaml.is_row_empty?(row)
          warn "found last"
          last_row = index
          break
        end
      end

      warn "first #{first_row} last #{last_row}"
      # A nil range start raises on Ruby 2.6, so fall back to 0 explicitly.
      csv[(first_row || 0)..last_row]
    end

    # Joins every "*.csv" file found (recursively) under +csvdir+ into
    # +outfile+.
    #
    # Files are processed in sorted path order. The header row of the first
    # non-empty file, prefixed with a "name" column, becomes the header of the
    # output; all files are assumed to share that header structure. Each data
    # row is written with its cells stripped and prefixed with the source
    # file's base name (without extension); rows whose cells are all nil or
    # empty are dropped. Files without any row are skipped.
    #
    # Inputs are validated before the output file is opened, so a failed call
    # does not create or truncate +outfile+. If +outfile+ lies under +csvdir+
    # it is not treated as an input.
    #
    # @param csvdir [String] directory containing the CSV files
    # @param outfile [String] path of the CSV file to write
    # @return [nil]
    # @raise [RuntimeError] when +csvdir+ is not a directory or holds no CSV
    #   input files
    def self.convert(csvdir, outfile)
      raise "first argument must be a directory!" unless File.directory?(csvdir)

      output_path = File.expand_path(outfile)
      csvfiles = Dir.glob(File.join(csvdir, "**", "*.csv"))
                    .reject { |path| File.expand_path(path) == output_path }
                    .sort
      raise "directory must contain .csv files!" if csvfiles.empty?

      # The block form closes the output even when reading an input fails.
      CSV.open(outfile, "wb", encoding: "UTF-8") do |csv|
        header_written = false

        csvfiles.each do |csvfile|
          content = StructuredCsv::Common.load_csv(csvfile)

          csvheader = content.shift
          # An input without any row has no header to contribute.
          next if csvheader.nil?

          # Assume all files use the same header structure as the first CSV file
          unless header_written
            csv << (["name"] + csvheader)
            header_written = true
          end

          basename = Pathname.new(csvfile).basename.sub_ext("").to_s
          content.each do |filerow|
            row = filerow.map { |value| value.is_a?(String) ? value.strip : value }
            next if row.all? { |field| field.nil? || field.empty? }

            csv << [basename, *row]
          end
        end
      end

      nil
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module StructuredCsv
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.1"
5
5
  end
@@ -8,12 +8,12 @@ Gem::Specification.new do |spec|
8
8
  spec.authors = ["Ribose Inc."]
9
9
  spec.email = ["open.source@ribose.com"]
10
10
 
11
- spec.summary = "Library to process structured CSV files"
12
- spec.description = "Library to process structured CSV files"
13
- spec.homepage = "https://open.ribose.com"
14
- spec.required_ruby_version = Gem::Requirement.new(">= 2.4.0")
11
+ spec.summary = "Library to process structured CSV files"
12
+ spec.description = "Library to process structured CSV files"
13
+ spec.homepage = "https://open.ribose.com"
14
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.6.7")
15
15
 
16
- spec.metadata["homepage_uri"] = spec.homepage
16
+ spec.metadata["homepage_uri"] = spec.homepage
17
17
  spec.metadata["source_code_uri"] = "https://github.com/riboseinc/structured_csv"
18
18
  # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
19
19
 
@@ -31,4 +31,15 @@ Gem::Specification.new do |spec|
31
31
 
32
32
  # For more information and examples about making a new gem, checkout our
33
33
  # guide at: https://bundler.io/guides/creating_gem.html
34
+
35
+ spec.add_dependency "csv", "~> 3.1"
36
+ # spec.add_dependency "pathname", "~> 0.1"
37
+ spec.add_dependency "yaml", "~> 0.1"
38
+
39
+ spec.add_development_dependency "byebug", "~> 11.1"
40
+ spec.add_development_dependency "guard", "~> 2.17"
41
+ spec.add_development_dependency "guard-rspec", "~> 4.7"
42
+ spec.add_development_dependency "rake", "~> 13.0"
43
+ spec.add_development_dependency "rspec", "~> 3.10"
44
+ spec.add_development_dependency "simplecov", "~> 0.21"
34
45
  end