structured_csv 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +18 -0
- data/.gitattributes +1 -0
- data/.github/workflows/main.yml +6 -2
- data/.gitignore +256 -6
- data/.hound.yml +3 -0
- data/.rubocop.yml +40 -3
- data/Gemfile +7 -5
- data/README.adoc +29 -3
- data/exe/csv_join +27 -0
- data/exe/csv_join.rb +1 -0
- data/exe/structured_csv_to_yaml +30 -0
- data/exe/structured_csv_to_yaml.rb +1 -0
- data/lib/structured_csv.rb +3 -5
- data/lib/structured_csv/common.rb +12 -0
- data/lib/structured_csv/csv2yaml.rb +239 -0
- data/lib/structured_csv/csv_join.rb +78 -0
- data/lib/structured_csv/version.rb +1 -1
- data/structured_csv.gemspec +16 -5
- metadata +127 -5
- data/exe/csv_join.rb +0 -95
- data/exe/structured_csv_to_yaml.rb +0 -254
data/exe/csv_join
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Joins every .csv file found under a directory into one CSV file named after
# that directory (DIRECTORY.csv, written next to the directory).
#
# Usage: csv_join DIRECTORY

# resolve bin path, ignoring symlinks
require "pathname"
bin_file = Pathname.new(__FILE__).realpath

# add self to libpath
$LOAD_PATH.unshift File.expand_path("../../lib", bin_file)

# Fixes https://github.com/rubygems/rubygems/issues/1420
require "rubygems/specification"

class Gem::Specification
  def this
    self
  end
end

# Load the whole library rather than only structured_csv/csv_join: CsvJoin
# refers to StructuredCsv::Common and StructuredCsv::Csv2Yaml, which live in
# sibling files that lib/structured_csv.rb requires.
require "structured_csv"

# The directory is taken from the LAST command-line argument.
csvdir = ARGV.pop
raise "first argument must be a directory!" if csvdir.nil?

# cleanpath drops a trailing "/" first; without it "dir/" would become
# "dir/.csv" (a hidden file inside the input directory) instead of "dir.csv".
outfile = Pathname.new(csvdir).cleanpath.sub_ext(".csv").to_s

StructuredCsv::CsvJoin.convert(csvdir, outfile)
|
data/exe/csv_join.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
exe/csv_join
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Converts one structured CSV file into a YAML file written next to it
# (same path with the "csv" extension replaced by "yaml").
#
# Usage: structured_csv_to_yaml FILE.csv

# resolve bin path, ignoring symlinks
require "pathname"
bin_file = Pathname.new(__FILE__).realpath

# add self to libpath
$LOAD_PATH.unshift File.expand_path("../../lib", bin_file)

# Fixes https://github.com/rubygems/rubygems/issues/1420
require "rubygems/specification"

class Gem::Specification
  def this
    self
  end
end

# Load the whole library rather than only structured_csv/csv2yaml:
# Csv2Yaml.convert calls StructuredCsv::Common.load_csv, which lives in a
# sibling file that lib/structured_csv.rb requires.
require "structured_csv"

# The input file is taken from the LAST command-line argument.
csvfile = ARGV.pop
# \z anchors at the real end of the string ($ would also match before a
# newline). Regexp#match? returns false for nil, so a missing argument
# raises the same message.
raise "first argument must be a .csv file!" unless /\.csv\z/.match?(csvfile)

outfile = csvfile.sub(/csv\z/, "yaml")

# The converter module is StructuredCsv::Csv2Yaml; the previous reference to
# "CsvTo2Yaml" named a constant that is not defined anywhere.
File.write(
  outfile,
  StructuredCsv::Csv2Yaml.convert(csvfile).to_yaml,
)
|
@@ -0,0 +1 @@
|
|
1
|
+
exe/structured_csv_to_yaml
|
data/lib/structured_csv.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative "structured_csv/version"
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
# Your code goes here...
|
8
|
-
end
|
4
|
+
require_relative "structured_csv/common"
|
5
|
+
require_relative "structured_csv/csv2yaml"
|
6
|
+
require_relative "structured_csv/csv_join"
|
@@ -0,0 +1,239 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "yaml"
|
3
|
+
|
4
|
+
module StructuredCsv
  # Turns the rows of a "structured CSV" file into plain Ruby hashes/arrays.
  #
  # Every method works on an array of rows, each row being an array of cells.
  # A section starts on the row after a marker row (a row whose first cell
  # equals the section name; `convert` looks for "METADATA" and "DATA") and
  # runs up to and including the next blank row.
  module Csv2Yaml
    # Type assumed when a header cell carries no "[type]" suffix.
    CAST_DEFAULT_TYPE = "string".freeze

    # Locates one named section inside the parsed CSV.
    #
    # @param csv [Array<Array>] all rows of the file
    # @param section_name [String] text expected in the first cell of the
    #   section's marker row
    # @return [Hash] with keys
    #   :first_row (index of the row after the marker, nil if no marker found),
    #   :last_row  (index of the terminating blank row, -1 if none),
    #   :rows      (csv[first_row..last_row]; the whole table when the marker
    #               is missing),
    #   :meta      (options parsed from the marker row's second cell)
    def self.get_portion(csv, section_name)
      first_row = nil
      last_row = -1
      data_meta = {}

      warn "section_name #{section_name}"

      csv.each_with_index do |row, index|
        if first_row.nil?
          next unless is_start_of_portion?(row, section_name)

          data_meta = parse_portion_options(row[1])
          first_row = index + 1
          next
        end

        # Inside the section: keep going until the first blank row.
        next unless is_row_empty?(row)

        last_row = index
        break
      end

      {
        first_row: first_row,
        last_row: last_row,
        rows: csv[(first_row || 0)..last_row],
        meta: data_meta
      }
    end

    # Parses a "key=value;key=value" option string from a marker row.
    #
    # Empty options (e.g. from ";;" or a trailing ";") are ignored, and only
    # the first "=" separates key from value so values may contain "=".
    #
    # @param field [String, nil]
    # @return [Hash{Symbol => String, nil}]
    def self.parse_portion_options(field)
      field.to_s.split(";").each_with_object({}) do |option, meta|
        key, value = option.split("=", 2)
        key = key.to_s.strip
        next if key.empty?

        meta[key.to_sym] = value&.strip
      end
    end
    private_class_method :parse_portion_options

    # @param row [Array] a CSV row
    # @param section_name [#to_s]
    # @return [Boolean] true when the row's first cell, stripped, equals the
    #   section name
    def self.is_start_of_portion?(row, section_name)
      return false if row.first.nil?

      # to_s before strip so a non-String first cell cannot raise.
      row.first.to_s.strip == section_name.to_s
    end

    # @param row [Array] a CSV row
    # @return [Boolean] true when every cell is nil or a blank/whitespace-only
    #   string
    def self.is_row_empty?(row)
      row.all? do |field|
        field.nil? || (field.is_a?(String) && field.strip.empty?)
      end
    end

    # Splits a header cell of the form "name[type]" into its parts.
    #
    # @param header_field [String, nil]
    # @return [Hash] { name: String, type: String }; type falls back to
    #   CAST_DEFAULT_TYPE when there is no (or an empty) "[type]" suffix
    def self.split_header_key_type(header_field)
      field = header_field.to_s.strip
      parts = field.match(/\A([^\[]*)\[(.*)\]\z/)

      return { name: field, type: CAST_DEFAULT_TYPE } if parts.nil?

      type = parts[2].strip
      type = CAST_DEFAULT_TYPE if type.empty?

      { name: parts[1].strip, type: type }
    end

    # Converts a raw cell into the Ruby value for the given type name.
    #
    # Supported types (case-insensitive): "boolean", "integer", "string" and
    # "array{element_type}" (cell split on ";"; element type defaults to
    # CAST_DEFAULT_TYPE when the braces are empty). Any other type returns
    # value.to_s untouched.
    #
    # @param value [Object, nil] raw cell
    # @param type_in_string [String] type name
    # @return [Object, nil] nil for a nil value, and for a boolean cell that
    #   is neither "true" nor "false"
    def self.cast_type(value, type_in_string)
      return if value.nil?

      type = type_in_string.to_s.strip.downcase
      text = value.to_s.strip

      case type
      when "boolean"
        case text.downcase
        when "true" then true
        when "false" then false
        end
      when "integer"
        # NOTE: String#to_i yields 0 for non-numeric text.
        text.to_i
      when "string"
        text
      when /\Aarray\{(.*)\}/
        element_type = Regexp.last_match(1)
        # The capture is "" (never nil) for "array{}", so test for emptiness.
        element_type = CAST_DEFAULT_TYPE if element_type.empty?
        text.split(";").map { |item| cast_type(item, element_type) }
      else
        value.to_s
      end
    end

    # Builds the metadata hash from "key[type], value" rows.
    #
    # Blank rows and rows without a key in the first cell are skipped.
    #
    # @param rows [Array<Array>]
    # @return [Hash] metadata with dotted keys expanded into nested hashes
    def self.parse_metadata(rows)
      flat = rows.each_with_object({}) do |row, acc|
        next if is_row_empty?(row)
        next if row.first.to_s.strip.empty?

        name_type = split_header_key_type(row.first)
        acc[name_type[:name]] = cast_type(row[1], name_type[:type])
      end

      normalize_namespaces(flat)
    end

    # Builds the data structure from a header row followed by value rows.
    #
    # data_meta options read here:
    #   :name - when set, the result is wrapped as { name => result }
    #   :type - "hash" (default; records keyed by the first header column) or
    #           "array" (records in row order). Any other value yields nil.
    # The :key option is accepted in the marker row but is not used here.
    #
    # @param rows [Array<Array>] first row is the header
    # @param data_meta [Hash{Symbol => String}]
    # @return [Hash, Array, nil]
    def self.parse_data(rows, data_meta)
      data_name = data_meta[:name]
      data_type = data_meta[:type] || "hash"

      base_structure = case data_type
                       when "hash" then {}
                       when "array" then []
                       end

      columns = header_columns(rows.first)
      names = columns.map { |_index, spec| spec[:name] }

      rows.drop(1).each do |row|
        # Skip all the empty rows
        next if is_row_empty?(row)

        # Skip if no key value
        next if row[0].nil?

        values = columns.map do |index, spec|
          cell = row[index]
          cell = cell.strip if cell.is_a?(String)
          cast_type(cell, spec[:type])
        end

        key = values.first
        # Drop columns whose value is nil, then expand dotted names.
        record = normalize_namespaces(names.zip(values).to_h.compact)

        case data_type
        when "hash"
          unless base_structure[key].nil?
            warn "[WARNING] there is already data inside key [#{key}] -- maybe you should set type=array?"
          end
          base_structure[key] = record
        when "array"
          base_structure << record
        end
      end

      data_name ? { data_name => base_structure } : base_structure
    end

    # Pairs every non-blank header cell with its ORIGINAL column index, so a
    # blank header cell in the middle of the row cannot shift later columns
    # onto the wrong values.
    #
    # @param header_row [Array, nil]
    # @return [Array<Array(Integer, Hash)>] [column_index, {name:, type:}]
    def self.header_columns(header_row)
      Array(header_row).each_with_index
        .reject { |field, _index| field.to_s.strip.empty? }
        .map { |field, index| [index, split_header_key_type(field)] }
    end
    private_class_method :header_columns

    # Loads a structured CSV file and converts its METADATA and DATA sections.
    #
    # @param csv_filename [String]
    # @return [Hash] { "metadata" => Hash, "data" => Hash/Array/nil }
    def self.convert(csv_filename)
      raw_data = StructuredCsv::Common.load_csv(csv_filename)

      metadata_section = get_portion(raw_data, "METADATA")
      data_section = get_portion(raw_data, "DATA")

      {
        "metadata" => parse_metadata(metadata_section[:rows]),
        "data" => parse_data(data_section[:rows], data_section[:meta])
      }
    end

    # Structure all child hashes if the key is namespaced.
    # e.g. { "hello.me" => data } becomes
    #      { "hello" => { "me" => data } }
    #
    # Keys are converted with to_s before splitting on ".".
    #
    # @param hash [Hash]
    # @return [Hash] a new, nested hash
    def self.normalize_namespaces(hash)
      hash.each_with_object({}) do |(key, value), nested|
        *parents, leaf = key.to_s.split(".")
        level = parents.reduce(nested) { |node, part| node[part] ||= {} }
        level[leaf] = value
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "yaml"
|
3
|
+
require "pathname"
|
4
|
+
|
5
|
+
module StructuredCsv
  # Concatenates the CSV files found under a directory into one CSV file,
  # prefixing every data row with the name of the file it came from.
  module CsvJoin
    # Extracts the rows of one named section from parsed CSV rows.
    #
    # The section starts on the row after the marker row (as decided by
    # Csv2Yaml.is_start_of_portion?) and runs up to and including the next
    # row that Csv2Yaml.is_row_empty? reports as empty. Progress is reported
    # on stderr.
    #
    # @param csv [Array<Array>] all rows
    # @param section_name [String] marker text
    # @return [Array<Array>] the section's rows; the whole table when the
    #   marker is not found (same fallback as Csv2Yaml.get_portion)
    def self.join(csv, section_name)
      first_row = nil
      last_row = -1

      warn "section_name #{section_name}"

      csv.each_with_index do |row, index|
        if first_row.nil?
          next unless Csv2Yaml.is_start_of_portion?(row, section_name)

          warn "found first"
          first_row = index + 1
          next
        end

        next unless Csv2Yaml.is_row_empty?(row)

        warn "found last"
        last_row = index
        break
      end

      warn "first #{first_row} last #{last_row}"
      # `nil..-1` is not a valid range on Ruby 2.6, so fall back to 0.
      csv[(first_row || 0)..last_row]
    end

    # Joins every "*.csv" file under csvdir (searched recursively, processed
    # in sorted path order) into outfile.
    #
    # The output header is "name" followed by the header row of the first
    # non-empty input file; all files are assumed to share that header. The
    # first row of every input file is treated as its header and dropped.
    # Each remaining row has its string cells stripped, is skipped when all
    # cells are nil/empty, and is otherwise written prefixed with the input
    # file's base name (without extension).
    #
    # @param csvdir [String] directory to scan
    # @param outfile [String] path of the CSV file to write
    # @return [nil]
    # @raise [RuntimeError] if csvdir is not a directory or holds no .csv file
    def self.convert(csvdir, outfile)
      raise "first argument must be a directory!" unless File.directory?(csvdir)

      # Never feed the output file back in as input when it lives inside
      # csvdir (e.g. left over from a previous run).
      target = File.expand_path(outfile)
      csvfiles = Dir.glob(File.join(csvdir, "**", "*.csv")).sort
        .reject { |path| File.expand_path(path) == target }
      raise "directory must contain .csv files!" if csvfiles.empty?

      # Validate before opening so a failed run neither creates nor truncates
      # outfile; the block form closes the file even if a read fails midway.
      CSV.open(outfile, "wb", encoding: "UTF-8") do |csv|
        # Assume all files use the same header structure as the first CSV file
        header = []

        csvfiles.each do |csvfile|
          content = StructuredCsv::Common.load_csv(csvfile)

          csvheader = content.shift
          # An empty input file has no header row and nothing to contribute.
          next if csvheader.nil?

          if header.empty?
            header = ["name"] + csvheader
            csv << header
          end

          basename = File.basename(csvfile, ".*")
          content.each do |filerow|
            row = filerow.map { |value| value.is_a?(String) ? value.strip : value }
            next if row.all? { |f| f.nil? || (f.respond_to?(:empty?) && f.empty?) }

            csv << [basename, *row]
          end
        end
      end

      nil
    end
  end
end
|
data/structured_csv.gemspec
CHANGED
@@ -8,12 +8,12 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.authors = ["Ribose Inc."]
|
9
9
|
spec.email = ["open.source@ribose.com"]
|
10
10
|
|
11
|
-
spec.summary
|
12
|
-
spec.description
|
13
|
-
spec.homepage
|
14
|
-
spec.required_ruby_version = Gem::Requirement.new(">= 2.
|
11
|
+
spec.summary = "Library to process structured CSV files"
|
12
|
+
spec.description = "Library to process structured CSV files"
|
13
|
+
spec.homepage = "https://open.ribose.com"
|
14
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.6.7")
|
15
15
|
|
16
|
-
spec.metadata["homepage_uri"]
|
16
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
17
17
|
spec.metadata["source_code_uri"] = "https://github.com/riboseinc/structured_csv"
|
18
18
|
# spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
|
19
19
|
|
@@ -31,4 +31,15 @@ Gem::Specification.new do |spec|
|
|
31
31
|
|
32
32
|
# For more information and examples about making a new gem, checkout our
|
33
33
|
# guide at: https://bundler.io/guides/creating_gem.html
|
34
|
+
|
35
|
+
spec.add_dependency "csv", "~> 3.1"
|
36
|
+
# spec.add_dependency "pathname", "~> 0.1"
|
37
|
+
spec.add_dependency "yaml", "~> 0.1"
|
38
|
+
|
39
|
+
spec.add_development_dependency "byebug", "~> 11.1"
|
40
|
+
spec.add_development_dependency "guard", "~> 2.17"
|
41
|
+
spec.add_development_dependency "guard-rspec", "~> 4.7"
|
42
|
+
spec.add_development_dependency "rake", "~> 13.0"
|
43
|
+
spec.add_development_dependency "rspec", "~> 3.10"
|
44
|
+
spec.add_development_dependency "simplecov", "~> 0.21"
|
34
45
|
end
|