structured_csv 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.editorconfig +18 -0
- data/.gitattributes +1 -0
- data/.github/workflows/main.yml +6 -2
- data/.gitignore +256 -6
- data/.hound.yml +3 -0
- data/.rubocop.yml +40 -3
- data/Gemfile +7 -5
- data/README.adoc +29 -3
- data/exe/csv_join +27 -0
- data/exe/csv_join.rb +1 -0
- data/exe/structured_csv_to_yaml +30 -0
- data/exe/structured_csv_to_yaml.rb +1 -0
- data/lib/structured_csv.rb +3 -5
- data/lib/structured_csv/common.rb +12 -0
- data/lib/structured_csv/csv2yaml.rb +239 -0
- data/lib/structured_csv/csv_join.rb +78 -0
- data/lib/structured_csv/version.rb +1 -1
- data/structured_csv.gemspec +16 -5
- metadata +127 -5
- data/exe/csv_join.rb +0 -95
- data/exe/structured_csv_to_yaml.rb +0 -254
data/exe/csv_join
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point: joins every CSV file found under a directory into
# one CSV file written next to that directory (<directory>.csv).
#
# Usage: csv_join <directory-of-csv-files>

# resolve bin path, ignoring symlinks
require "pathname"
bin_file = Pathname.new(__FILE__).realpath

# add self to libpath
$:.unshift File.expand_path("../../lib", bin_file)

# Fixes https://github.com/rubygems/rubygems/issues/1420
require "rubygems/specification"

class Gem::Specification
  def this
    self
  end
end

# Load the whole library rather than only "structured_csv/csv_join":
# CsvJoin.convert calls StructuredCsv::Common.load_csv, and csv_join.rb does
# not require "structured_csv/common" itself.
require "structured_csv"

# The directory is taken from the last command-line argument.
csvdir = ARGV.pop
abort "usage: csv_join <directory-of-csv-files>" if csvdir.nil?

# cleanpath drops a trailing slash first, so "data/" becomes "data.csv"
# instead of a hidden "data/.csv" inside the input directory.
outfile = Pathname.new(csvdir).cleanpath.sub_ext(".csv").to_s

StructuredCsv::CsvJoin.convert(csvdir, outfile)
|
data/exe/csv_join.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
exe/csv_join
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point: converts a structured CSV file into a YAML file
# written next to it (same path, ".csv" replaced by ".yaml").
#
# Usage: structured_csv_to_yaml <file.csv>

# resolve bin path, ignoring symlinks
require "pathname"
bin_file = Pathname.new(__FILE__).realpath

# add self to libpath
$:.unshift File.expand_path("../../lib", bin_file)

# Fixes https://github.com/rubygems/rubygems/issues/1420
require "rubygems/specification"

class Gem::Specification
  def this
    self
  end
end

# Load the whole library rather than only "structured_csv/csv2yaml":
# Csv2Yaml.convert calls StructuredCsv::Common.load_csv, and csv2yaml.rb does
# not require "structured_csv/common" itself.
require "structured_csv"

# The CSV path is taken from the last command-line argument. A missing
# argument (nil) fails the match below and raises the same error.
csvfile = ARGV.pop
raise "first argument must be a .csv file!" unless /\.csv\z/.match?(csvfile)

outfile = csvfile.sub(/csv\z/, "yaml")

# The converter module is StructuredCsv::Csv2Yaml; the previous reference to
# "CsvTo2Yaml" named a constant that does not exist.
File.write(
  outfile,
  StructuredCsv::Csv2Yaml.convert(csvfile).to_yaml,
)
|
@@ -0,0 +1 @@
|
|
1
|
+
exe/structured_csv_to_yaml
|
data/lib/structured_csv.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative "structured_csv/version"
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
# Your code goes here...
|
8
|
-
end
|
4
|
+
require_relative "structured_csv/common"
|
5
|
+
require_relative "structured_csv/csv2yaml"
|
6
|
+
require_relative "structured_csv/csv_join"
|
@@ -0,0 +1,239 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "yaml"
|
3
|
+
|
4
|
+
module StructuredCsv
  # Converts a "structured CSV" (an array of rows containing a METADATA
  # section and a DATA section, each introduced by a marker row) into plain
  # Ruby hashes/arrays ready to be serialized as YAML.
  module Csv2Yaml
    # Type assumed for a key/column that carries no "[type]" suffix.
    CAST_DEFAULT_TYPE = "string".freeze

    # Locates the section introduced by a row whose first cell equals
    # +section_name+ and returns its rows.
    #
    # csv          - Array of rows (each row an Array of cells).
    # section_name - marker text expected in the first cell of the marker row.
    #
    # Returns a Hash with:
    #   :first_row - index of the row right after the marker, or nil when the
    #                marker was not found
    #   :last_row  - index of the first empty row after the marker, or -1 when
    #                the section runs to the end of +csv+
    #   :rows      - csv[first_row..last_row] (the whole +csv+ when the marker
    #                was not found); includes the terminating empty row
    #   :meta      - options parsed from the second cell of the marker row
    #                ("key=value;key=value"), keyed by Symbol
    def self.get_portion(csv, section_name)
      first_row = nil
      last_row = -1
      data_meta = {}

      warn "section_name #{section_name}"

      csv.each_with_index do |row, index|
        if first_row.nil?
          # Still looking for the marker row.
          next unless is_start_of_portion?(row, section_name)

          data_meta = parse_section_options(row[1])
          first_row = index + 1
          next
        end

        # Inside the section: the first empty row ends it.
        next unless is_row_empty?(row)

        last_row = index
        break
      end

      {
        first_row: first_row,
        last_row: last_row,
        rows: csv[(first_row || 0)..last_row],
        meta: data_meta
      }
    end

    # Parses the "key=value;key=value" option string found next to a section
    # marker. Keys and values are stripped, empty fragments are ignored, and
    # only the first "=" separates key from value (so values may contain "=").
    #
    # Returns a Hash keyed by Symbol; an option without "=" maps to nil.
    def self.parse_section_options(raw)
      raw.to_s.split(";").each_with_object({}) do |option, meta|
        key, value = option.split("=", 2).map(&:strip)
        next if key.nil? || key.empty?

        meta[key.to_sym] = value
      end
    end
    private_class_method :parse_section_options

    # True when the first cell of +row+, stripped, equals +section_name+.
    # A row whose first cell is nil never starts a section.
    def self.is_start_of_portion?(row, section_name)
      marker = row.first
      return false if marker.nil?

      marker.to_s.strip == section_name.to_s
    end

    # True when every cell of +row+ is nil or contains only whitespace.
    # (Whitespace-only cells count as empty; that is why cells are stripped.)
    def self.is_row_empty?(row)
      row.all? { |field| field.to_s.strip.empty? }
    end

    # Splits a header cell of the form "name[type]" into its parts.
    #
    # Returns { name: String, type: String }; when the cell has no "[type]"
    # suffix the whole cell is the name and the type is CAST_DEFAULT_TYPE.
    def self.split_header_key_type(header_field)
      matched = header_field.match(/\A([^\[]*)\[(.*)\]\Z/)
      return { name: header_field, type: CAST_DEFAULT_TYPE } if matched.nil?

      { name: matched[1], type: matched[2] }
    end

    # Casts a raw cell value according to a type name (case-insensitive).
    #
    # Supported types:
    #   "boolean"     - true/false for "true"/"false" (whitespace and case
    #                   ignored), nil for anything else
    #   "integer"     - String#to_i of the stripped value
    #   "string"      - the stripped value
    #   "array{type}" - value split on ";", each element cast as +type+
    #                   (CAST_DEFAULT_TYPE when the braces are empty)
    #   anything else - value.to_s, unchanged
    #
    # Returns nil when +value+ is nil.
    def self.cast_type(value, type_in_string)
      return if value.nil?

      case type_in_string.downcase
      when "boolean"
        case value.to_s.strip.downcase
        when "true" then true
        when "false" then false
        end
      when "integer"
        value.to_s.strip.to_i
      when "string"
        value.to_s.strip
      when /\Aarray\{(.*)\}/
        # The capture is "" (never nil) for "array{}", so test for emptiness
        # to apply the default element type.
        element_type = Regexp.last_match(1)
        element_type = CAST_DEFAULT_TYPE if element_type.empty?
        value.to_s.split(";").map { |item| cast_type(item, element_type) }
      else
        value.to_s
      end
    end

    # Builds the metadata hash from the rows of the METADATA section.
    # Each non-empty row is "key[type], value"; rows without a key cell are
    # skipped. Dotted keys are nested via normalize_namespaces.
    def self.parse_metadata(rows)
      flat = {}

      rows.each do |row|
        next if is_row_empty?(row)
        next if row.first.nil?

        name_type = split_header_key_type(row.first)
        flat[name_type[:name]] = cast_type(row[1], name_type[:type])
      end

      normalize_namespaces(flat)
    end

    # Parses the header row of the DATA section into column descriptors
    # { name:, type:, index: }. Blank header cells are skipped, but every
    # descriptor remembers its original column index so that data cells are
    # read from the matching column.
    def self.parse_header_row(row)
      row.each_with_index.map do |field, column_index|
        next if field.to_s.strip.empty?

        split_header_key_type(field).merge(index: column_index)
      end.compact
    end
    private_class_method :parse_header_row

    # Builds the data structure from the rows of the DATA section.
    #
    # rows      - section rows; the first one is the header row.
    # data_meta - section options: :type ("hash", the default, or "array")
    #             and :name (optional wrapper key). Other options are ignored;
    #             the first header column is always used as the record key.
    #
    # With type "hash" the result maps each record's key to the record; with
    # type "array" it is a list of records. Records omit nil values and have
    # dotted column names nested. When :name is given the result is wrapped as
    # { name => result }. An unrecognized type yields nil in place of the
    # collection.
    def self.parse_data(rows, data_meta)
      data_name = data_meta[:name]
      data_type = data_meta[:type] || "hash"

      base_structure = case data_type
                       when "hash" then {}
                       when "array" then []
                       end

      header = []

      rows.each_with_index do |row, index|
        if index.zero?
          header = parse_header_row(row)
          next
        end

        # Skip all the empty rows
        next if is_row_empty?(row)

        # Skip if no key value (the key lives in the first header column)
        key_index = header.empty? ? 0 : header.first[:index]
        next if row[key_index].nil?

        values = header.map do |column|
          raw = row[column[:index]]
          raw = raw.strip if raw.respond_to?(:strip)
          cast_type(raw, column[:type])
        end

        key = values.first
        # Remove keys if they point to nil
        record = header.map { |column| column[:name] }.zip(values).to_h.compact

        case data_type
        when "hash"
          if base_structure.key?(key)
            warn "[WARNING] there is already data inside key [#{key}] -- maybe you should set type=array?"
          end
          base_structure[key] = normalize_namespaces(record)
        when "array"
          base_structure << normalize_namespaces(record)
        end
      end

      data_name ? { data_name => base_structure } : base_structure
    end

    # Loads +csv_filename+ through StructuredCsv::Common.load_csv and converts
    # it into { "metadata" => ..., "data" => ... }.
    def self.convert(csv_filename)
      raw_data = StructuredCsv::Common.load_csv(csv_filename)

      metadata_section = get_portion(raw_data, "METADATA")
      data_section = get_portion(raw_data, "DATA")

      {
        "metadata" => parse_metadata(metadata_section[:rows]),
        "data" => parse_data(data_section[:rows], data_section[:meta])
      }
    end

    # Structure all child hashes if the key is namespaced.
    # e.g. { "hello.me" => data } becomes
    # { "hello" => { "me" => data } }
    #
    def self.normalize_namespaces(hash)
      hash.each_with_object({}) do |(key, value), nested|
        *parents, leaf = key.to_s.split(".")

        # Walk (creating as needed) the intermediate hashes, then store the
        # value under the last component.
        level = parents.reduce(nested) { |node, name| node[name] ||= {} }
        level[leaf] = value
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "yaml"
|
3
|
+
require "pathname"
|
4
|
+
|
5
|
+
module StructuredCsv
  # Joins all CSV files found under a directory into a single CSV file.
  module CsvJoin
    # Returns the rows of the section introduced by a row whose first cell
    # equals +section_name+ (as decided by Csv2Yaml.is_start_of_portion?), up
    # to and including the first empty row after it (as decided by
    # Csv2Yaml.is_row_empty?), or up to the end of +csv+ when there is none.
    # When the marker is not found, all rows are returned.
    #
    # Progress is traced on stderr.
    def self.join(csv, section_name)
      first_row = nil
      last_row = -1

      warn "section_name #{section_name}"

      csv.each_with_index do |row, index|
        if first_row.nil?
          # Still looking for the marker row.
          next unless Csv2Yaml.is_start_of_portion?(row, section_name)

          warn "found first"
          first_row = index + 1
          next
        end

        next unless Csv2Yaml.is_row_empty?(row)

        warn "found last"
        last_row = index
        break
      end

      warn "first #{first_row} last #{last_row}"
      # Fall back to 0 explicitly: a nil range start raises on Ruby 2.6.
      csv[(first_row || 0)..last_row]
    end

    # Concatenates every "*.csv" file under +csvdir+ (searched recursively, in
    # sorted path order) into +outfile+.
    #
    # The first row of each input file is treated as its header; the output
    # header is "name" followed by the header of the first non-empty file
    # (all files are assumed to share that header structure). Every data row
    # is prefixed with the base name (without extension) of the file it came
    # from; String cells are stripped and rows whose cells are all nil or
    # empty are dropped.
    #
    # Input is validated before +outfile+ is created, and +outfile+ itself is
    # never treated as an input even when it lives under +csvdir+.
    #
    # Raises RuntimeError when +csvdir+ is not a directory or contains no
    # .csv files. Returns nil.
    def self.convert(csvdir, outfile)
      raise "first argument must be a directory!" unless File.directory?(csvdir)

      target = File.expand_path(outfile)
      csvfiles = Dir.glob(File.join(csvdir, "**", "*.csv"))
        .reject { |path| File.expand_path(path) == target }
        .sort
      raise "directory must contain .csv files!" if csvfiles.empty?

      # Block form closes the output file even if reading an input fails.
      CSV.open(outfile, "wb", encoding: "UTF-8") do |csv|
        header_written = false

        csvfiles.each do |csvfile|
          content = StructuredCsv::Common.load_csv(csvfile)

          csvheader = content.shift
          # A file without any row has no header and no data to contribute.
          next if csvheader.nil?

          # Assume all files use the same header structure as the first CSV file
          unless header_written
            csv << (["name"] + csvheader)
            header_written = true
          end

          basename = Pathname.new(csvfile).basename.sub_ext("").to_s

          content.each do |filerow|
            row = filerow.map { |value| value.is_a?(String) ? value.strip : value }
            next if row.all? { |field| field.to_s.empty? }

            csv << [basename, *row]
          end
        end
      end

      nil
    end
  end
end
|
data/structured_csv.gemspec
CHANGED
@@ -8,12 +8,12 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.authors = ["Ribose Inc."]
|
9
9
|
spec.email = ["open.source@ribose.com"]
|
10
10
|
|
11
|
-
spec.summary
|
12
|
-
spec.description
|
13
|
-
spec.homepage
|
14
|
-
spec.required_ruby_version = Gem::Requirement.new(">= 2.
|
11
|
+
spec.summary = "Library to process structured CSV files"
|
12
|
+
spec.description = "Library to process structured CSV files"
|
13
|
+
spec.homepage = "https://open.ribose.com"
|
14
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.6.7")
|
15
15
|
|
16
|
-
spec.metadata["homepage_uri"]
|
16
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
17
17
|
spec.metadata["source_code_uri"] = "https://github.com/riboseinc/structured_csv"
|
18
18
|
# spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
|
19
19
|
|
@@ -31,4 +31,15 @@ Gem::Specification.new do |spec|
|
|
31
31
|
|
32
32
|
# For more information and examples about making a new gem, checkout our
|
33
33
|
# guide at: https://bundler.io/guides/creating_gem.html
|
34
|
+
|
35
|
+
spec.add_dependency "csv", "~> 3.1"
|
36
|
+
# spec.add_dependency "pathname", "~> 0.1"
|
37
|
+
spec.add_dependency "yaml", "~> 0.1"
|
38
|
+
|
39
|
+
spec.add_development_dependency "byebug", "~> 11.1"
|
40
|
+
spec.add_development_dependency "guard", "~> 2.17"
|
41
|
+
spec.add_development_dependency "guard-rspec", "~> 4.7"
|
42
|
+
spec.add_development_dependency "rake", "~> 13.0"
|
43
|
+
spec.add_development_dependency "rspec", "~> 3.10"
|
44
|
+
spec.add_development_dependency "simplecov", "~> 0.21"
|
34
45
|
end
|