sycsvpro 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +1 -1
- data/README.md +37 -3
- data/bin/sycsvpro +38 -2
- data/lib/sycsvpro/aggregator.rb +13 -0
- data/lib/sycsvpro/allocator.rb +12 -0
- data/lib/sycsvpro/analyzer.rb +14 -0
- data/lib/sycsvpro/join.rb +1 -1
- data/lib/sycsvpro/merger.rb +127 -0
- data/lib/sycsvpro/version.rb +1 -1
- data/lib/sycsvpro.rb +1 -0
- data/spec/sycsvpro/merger_spec.rb +105 -0
- metadata +4 -2
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -19,13 +19,14 @@ Processing of csv files. *sycsvpro* offers following functions
|
|
19
19
|
* create a table from a source file with dynamically created columns (since
|
20
20
|
version 0.1.4)
|
21
21
|
* join two files based on a joint column value (since version 0.1.7)
|
22
|
+
* merge files based on common headline columns (since version 0.1.10)
|
22
23
|
|
23
24
|
To get help type
|
24
25
|
|
25
26
|
$ sycsvpro -h
|
26
27
|
|
27
|
-
In the following examples we assume the following files 'machines.csv'
|
28
|
-
'region.csv'
|
28
|
+
In the following examples we assume the following files 'machines.csv',
|
29
|
+
'region.csv' and 'revenue.csv'
|
29
30
|
|
30
31
|
```
|
31
32
|
customer;machine;control;drive;motor;date;contract;price;c-id
|
@@ -44,6 +45,13 @@ R3;US;345
|
|
44
45
|
R4;CA;456
|
45
46
|
```
|
46
47
|
|
48
|
+
```
|
49
|
+
2010;2011;2012;2013;2014;customer
|
50
|
+
50;100;150;100;200;hello
|
51
|
+
100;50;10;1000;20;indix
|
52
|
+
2000;250;300;3000;chiro
|
53
|
+
```
|
54
|
+
|
47
55
|
Analyze
|
48
56
|
-------
|
49
57
|
Analyze the content of the provided file *in.csv*
|
@@ -220,7 +228,27 @@ on streak.
|
|
220
228
|
-i "COUNTRY,REGION"
|
221
229
|
-j "3=8;3=10"
|
222
230
|
|
223
|
-
|
231
|
+
Merge
|
232
|
+
-----
|
233
|
+
Merge files machine_count.csv and revenue.csv based on the year columns.
|
234
|
+
|
235
|
+
$ sycsvpro -o out.csv merge machines.csv,revenue.csv
|
236
|
+
-h "2010,2013,2014"
|
237
|
+
-k "0,5"
|
238
|
+
-s "(\\d{4}),(\\d{4})"
|
239
|
+
|
240
|
+
This will create the out.csv
|
241
|
+
|
242
|
+
```
|
243
|
+
;2010;2013;2014
|
244
|
+
hello;1;0;0
|
245
|
+
indix;1;0;0
|
246
|
+
chiro;0;1;0
|
247
|
+
hello;50;100;200
|
248
|
+
indix;100;1000;20
|
249
|
+
chiro;2000;300;3000
|
250
|
+
```
|
251
|
+
|
224
252
|
Sort
|
225
253
|
----
|
226
254
|
Sort rows on specified columns as an example sort rows based on customer
|
@@ -439,6 +467,12 @@ Version 0.1.9
|
|
439
467
|
* When creating columns dynamically they are in arbitrary sequence. You can now
|
440
468
|
provide a switch `sort: "2"` which will sort the header from column 2 on.
|
441
469
|
|
470
|
+
Version 0.1.10
|
471
|
+
--------------
|
472
|
+
* It is now possible to merge multiple files based on common headline columns
|
473
|
+
* Fix ~/.syc/sycsvpro system directory creation when no .syc directory is
|
474
|
+
available
|
475
|
+
|
442
476
|
Installation
|
443
477
|
============
|
444
478
|
[](http://badge.fury.io/rb/sycsvpro)
|
data/bin/sycsvpro
CHANGED
@@ -21,7 +21,7 @@ row_regex = %r{
|
|
21
21
|
sycsvpro_directory = File.expand_path("~/.syc/sycsvpro")
|
22
22
|
|
23
23
|
unless File.exists? sycsvpro_directory
|
24
|
-
|
24
|
+
FileUtils.mkdir_p sycsvpro_directory
|
25
25
|
end
|
26
26
|
|
27
27
|
# Script directory
|
@@ -427,6 +427,37 @@ command :join do |c|
|
|
427
427
|
end
|
428
428
|
end
|
429
429
|
|
430
|
+
desc 'Merge multiple files based on a common column value with a key value at '+
|
431
|
+
'the first column of a row'
|
432
|
+
arg_name 'FILE1 FILE2 ...'
|
433
|
+
command :merge do |c|
|
434
|
+
c.desc 'The key columns in the source files, which contains the columns to '+
|
435
|
+
'be inserted into the outfile as first row column'
|
436
|
+
c.arg_name '0,3'
|
437
|
+
c.flag [:k, :key], :must_match => /^\d+(?:,\d+)*/
|
438
|
+
|
439
|
+
c.desc 'Header columns to be used as identifires for the columns of the '+
|
440
|
+
'merging files'
|
441
|
+
c.arg_name 'COL1,COL2,COL3'
|
442
|
+
c.flag [:h, :header]
|
443
|
+
|
444
|
+
c.desc 'Header column patterns to be used as the identifier of the columns '+
|
445
|
+
'of the files to be merged into the outfile'
|
446
|
+
c.arg_name 'PATTERN1,PATTERN2'
|
447
|
+
c.flag [:s, :source_header]
|
448
|
+
|
449
|
+
c.action do |global_options,options,args|
|
450
|
+
merge = Sycsvpro::Merger.new(outfile: global_options[:o],
|
451
|
+
files: args[0],
|
452
|
+
header: options[:h],
|
453
|
+
source_header: options[:s],
|
454
|
+
key: options[:k])
|
455
|
+
print 'Merging...'
|
456
|
+
merge.execute
|
457
|
+
print 'done'
|
458
|
+
end
|
459
|
+
end
|
460
|
+
|
430
461
|
desc 'Sort rows based on column values. It is possible to sort on multiple '+
|
431
462
|
'columns'
|
432
463
|
command :sort do |c|
|
@@ -584,11 +615,16 @@ pre do |global,command,options,args|
|
|
584
615
|
when :aggregate, :allocate, :calc, :collect, :count, :extract, :map, :sort
|
585
616
|
help_now! "You need to provide an input file '-f FILE'" if global[:f].nil?
|
586
617
|
help_now! "You need to provide a result file '-o OUT_FILE'" if global[:o].nil?
|
618
|
+
when :merge
|
619
|
+
help_now! "You need to provide a result file '-o OUT_FILE'" if global[:o].nil?
|
587
620
|
end
|
588
621
|
|
589
622
|
count = 0
|
590
623
|
|
591
|
-
unless command.name == :edit or
|
624
|
+
unless command.name == :edit or
|
625
|
+
command.name == :execute or
|
626
|
+
command.name == :list or
|
627
|
+
command.name == :merge
|
592
628
|
analyzer = Sycsvpro::Analyzer.new(global[:f])
|
593
629
|
result = analyzer.result
|
594
630
|
count = result.row_count
|
data/lib/sycsvpro/aggregator.rb
CHANGED
@@ -7,6 +7,19 @@ module Sycsvpro
|
|
7
7
|
|
8
8
|
# An Aggregator counts specified row values and adds a sum to the end of
|
9
9
|
# the row
|
10
|
+
#
|
11
|
+
# in.csv
|
12
|
+
#
|
13
|
+
# | Customer | 2013 | 2014 |
|
14
|
+
# | A | A1 | |
|
15
|
+
# | B | B1 | B16 |
|
16
|
+
# | A | A3 | A7 |
|
17
|
+
#
|
18
|
+
# out.csv
|
19
|
+
#
|
20
|
+
# | Customer | 2013 | 2014 | Sum |
|
21
|
+
# | A | 2 | 1 | 3 |
|
22
|
+
# | B | 1 | 1 | 2 |
|
10
23
|
class Aggregator
|
11
24
|
|
12
25
|
include Dsl
|
data/lib/sycsvpro/allocator.rb
CHANGED
@@ -2,6 +2,18 @@
|
|
2
2
|
module Sycsvpro
|
3
3
|
|
4
4
|
# Allocates columns to a key column
|
5
|
+
#
|
6
|
+
# infile.csv
|
7
|
+
#
|
8
|
+
# | Name | Product |
|
9
|
+
# | A | X1 |
|
10
|
+
# | B | Y2 |
|
11
|
+
# | A | W10 |
|
12
|
+
#
|
13
|
+
# outfile.csv
|
14
|
+
#
|
15
|
+
# | A | X1 | W10 |
|
16
|
+
# | B | Y2 | |
|
5
17
|
class Allocator
|
6
18
|
|
7
19
|
# File from that values are read
|
data/lib/sycsvpro/analyzer.rb
CHANGED
@@ -5,6 +5,20 @@ module Sycsvpro
|
|
5
5
|
Result = Struct.new(:cols, :col_count, :row_count, :sample_row)
|
6
6
|
|
7
7
|
# Analyzes the file structure
|
8
|
+
#
|
9
|
+
# | Name | C1 | C2 |
|
10
|
+
# | A | a | b |
|
11
|
+
#
|
12
|
+
# 3 columns: ["Name", "C1", "C2"]
|
13
|
+
# 2 rows
|
14
|
+
#
|
15
|
+
# Row sample data:
|
16
|
+
# A;b;c
|
17
|
+
#
|
18
|
+
# Column index: Column name | Column sample value
|
19
|
+
# 0: Name | A
|
20
|
+
# 1: C1 | a
|
21
|
+
# 2: C2 | b
|
8
22
|
class Analyzer
|
9
23
|
|
10
24
|
# File that is analyzed
|
data/lib/sycsvpro/join.rb
CHANGED
@@ -0,0 +1,127 @@
|
|
1
|
+
# Operating csv files
|
2
|
+
module Sycsvpro
|
3
|
+
|
4
|
+
# Merge files based on common header columns
|
5
|
+
#
|
6
|
+
# file1.csv
|
7
|
+
#
|
8
|
+
# | | 2010 | 2011 | 2012 | 2013 |
|
9
|
+
# | --- | ---- | ---- | ---- | ---- |
|
10
|
+
# | SP | 20 | 30 | 40 | 50 |
|
11
|
+
# | RP | 30 | 40 | 50 | 60 |
|
12
|
+
#
|
13
|
+
# file2.csv
|
14
|
+
#
|
15
|
+
# | | 2010 | 2011 | 2012 |
|
16
|
+
# | --- | ---- | ---- | ---- |
|
17
|
+
# | M | m1 | m2 | m3 |
|
18
|
+
# | N | n1 | n2 | n3 |
|
19
|
+
#
|
20
|
+
# merging results in
|
21
|
+
#
|
22
|
+
# merge.csv
|
23
|
+
#
|
24
|
+
# | | 2010 | 2011 | 2012 | 2013 |
|
25
|
+
# | --- | ---- | ---- | ---- | ---- |
|
26
|
+
# | SP | 20 | 30 | 40 | 50 |
|
27
|
+
# | RP | 30 | 40 | 50 | 60 |
|
28
|
+
# | M | m1 | m2 | m3 | |
|
29
|
+
# | N | n1 | n2 | n3 | |
|
30
|
+
#
|
31
|
+
class Merger
|
32
|
+
|
33
|
+
include Dsl
|
34
|
+
|
35
|
+
# file to that the result is written
|
36
|
+
attr_reader :outfile
|
37
|
+
# header patterns to be used to identify merge columns
|
38
|
+
attr_reader :source_header
|
39
|
+
# header columns
|
40
|
+
attr_reader :header_cols
|
41
|
+
# value that is used as first column of a row
|
42
|
+
attr_reader :key
|
43
|
+
# files to be merged based on header columns
|
44
|
+
attr_reader :files
|
45
|
+
# file to that the result is written to
|
46
|
+
attr_reader :outfile
|
47
|
+
|
48
|
+
# Merge files based on common header columns
|
49
|
+
#
|
50
|
+
# :call-seq:
|
51
|
+
# Sycsvpro::Merger.new(outfile: "out.csv",
|
52
|
+
# files: "file1.csv,file2.csv,filen.csv",
|
53
|
+
# header: "2010,2011,2012,2013,2014",
|
54
|
+
# source_header: "(\\d{4}),(\\d{4})",
|
55
|
+
# key: "0,0").execute
|
56
|
+
#
|
57
|
+
# Semantics
|
58
|
+
# =========
|
59
|
+
# Merges the files file1.csv, file2.csv ... based on the header columns
|
60
|
+
# 2010, 2011, 2012, 2013 and 2014 where columns are identified by the
|
61
|
+
# regex /(\d{4})/. The first column in a row is column 0 of the file1.csv
|
62
|
+
# and so on.
|
63
|
+
#
|
64
|
+
# outfile:: result is written to the outfile
|
65
|
+
# files:: list of files that get merged. In the result file the files are
|
66
|
+
# inserted in the sequence they are provided
|
67
|
+
# header:: header of the result file and key for assigning column values
|
68
|
+
# from source files to result file
|
69
|
+
# source_header:: pattern for each header of the source file to determine
|
70
|
+
# the column. The pattern is a regex without the enclosing slashes '/'
|
71
|
+
# key:: first column value from the source file that is used as first
|
72
|
+
# column in the target file
|
73
|
+
def initialize(options = {})
|
74
|
+
@outfile = options[:outfile]
|
75
|
+
@header_cols = options[:header].split(',')
|
76
|
+
@source_header = options[:source_header].split(',')
|
77
|
+
@key = options[:key].split(',')
|
78
|
+
@files = options[:files].split(',')
|
79
|
+
end
|
80
|
+
|
81
|
+
# Merges the files based on the provided parameters
|
82
|
+
def execute
|
83
|
+
File.open(outfile, 'w') do |out|
|
84
|
+
out.puts ";#{header_cols.join(';')}"
|
85
|
+
files.each do |file|
|
86
|
+
@current_key = @key.shift
|
87
|
+
@current_source_header = @source_header.shift
|
88
|
+
processed_header = false
|
89
|
+
File.open(file).each_with_index do |line, index|
|
90
|
+
next if line.chomp.empty?
|
91
|
+
|
92
|
+
unless processed_header
|
93
|
+
create_file_header unstring(line).split(';')
|
94
|
+
processed_header = true
|
95
|
+
next
|
96
|
+
end
|
97
|
+
|
98
|
+
out.puts create_line unstring(line).split(';')
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
private
|
105
|
+
|
106
|
+
# create a filter for the columns that match the header filter
|
107
|
+
def create_file_header(columns)
|
108
|
+
columns.each_with_index do |c,i|
|
109
|
+
next if i == @current_key
|
110
|
+
columns[i] = c.scan(Regexp.new(@current_source_header)).flatten[0]
|
111
|
+
end
|
112
|
+
|
113
|
+
@file_header = [@current_key.to_i]
|
114
|
+
header_cols.each do |h|
|
115
|
+
@file_header << columns.index(h)
|
116
|
+
end
|
117
|
+
@file_header.compact!
|
118
|
+
end
|
119
|
+
|
120
|
+
# create a line filtered by the file_header
|
121
|
+
def create_line(columns)
|
122
|
+
columns.values_at(*@file_header).join(';')
|
123
|
+
end
|
124
|
+
|
125
|
+
end
|
126
|
+
|
127
|
+
end
|
data/lib/sycsvpro/version.rb
CHANGED
data/lib/sycsvpro.rb
CHANGED
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'sycsvpro/merger.rb'
|
2
|
+
|
3
|
+
module Sycsvpro
|
4
|
+
|
5
|
+
describe Merger do
|
6
|
+
|
7
|
+
before do
|
8
|
+
@file1 = File.join(File.dirname(__FILE__), "files/merge1.csv")
|
9
|
+
@file2 = File.join(File.dirname(__FILE__), "files/merge2.csv")
|
10
|
+
@outfile = File.join(File.dirname(__FILE__), "files/merged.csv")
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should merge two files" do
|
14
|
+
header = "2010,2011,2012,2014"
|
15
|
+
key = "0,0"
|
16
|
+
source_header = "(\\d{4}),(\\d{4})"
|
17
|
+
|
18
|
+
Sycsvpro::Merger.new(outfile: @outfile,
|
19
|
+
files: "#{@file1},#{@file2}",
|
20
|
+
header: header,
|
21
|
+
key: key,
|
22
|
+
source_header: source_header).execute
|
23
|
+
|
24
|
+
result = [ ";2010;2011;2012;2014",
|
25
|
+
"SP;20;30;40;60",
|
26
|
+
"RP;30;40;50;70",
|
27
|
+
"MP;40;50;60;80",
|
28
|
+
"NP;50;60;70;90",
|
29
|
+
"M;m1;m2;m3",
|
30
|
+
"N;n1;n2;n3",
|
31
|
+
"O;o1;;o3", ]
|
32
|
+
|
33
|
+
rows = 0
|
34
|
+
|
35
|
+
File.open(@outfile).each_with_index do |row, index|
|
36
|
+
row.chomp.should eq result[index]
|
37
|
+
rows += 1
|
38
|
+
end
|
39
|
+
|
40
|
+
rows.should eq result.size
|
41
|
+
end
|
42
|
+
|
43
|
+
it "should merge two files with differnt key columns in the middle" do
|
44
|
+
header = "2010,2011,2012,2014"
|
45
|
+
key = "0,3"
|
46
|
+
source_header = "(\\d{4}),(\\d{4})"
|
47
|
+
|
48
|
+
Sycsvpro::Merger.new(outfile: @outfile,
|
49
|
+
files: "#{@file1},#{@file2}",
|
50
|
+
header: header,
|
51
|
+
key: key,
|
52
|
+
source_header: source_header).execute
|
53
|
+
|
54
|
+
result = [ ";2010;2011;2012;2014",
|
55
|
+
"SP;20;30;40;60",
|
56
|
+
"RP;30;40;50;70",
|
57
|
+
"MP;40;50;60;80",
|
58
|
+
"NP;50;60;70;90",
|
59
|
+
"MO;m1;m2;m3",
|
60
|
+
"NO;n1;n2;n3",
|
61
|
+
"OO;o1;;o3", ]
|
62
|
+
|
63
|
+
rows = 0
|
64
|
+
|
65
|
+
File.open(@outfile).each_with_index do |row, index|
|
66
|
+
row.chomp.should eq result[index]
|
67
|
+
rows += 1
|
68
|
+
end
|
69
|
+
|
70
|
+
rows.should eq result.size
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should merge two files with differnt key columns at the end" do
|
74
|
+
header = "2010,2011,2012,2014"
|
75
|
+
key = "0,6"
|
76
|
+
source_header = "(\\d{4}),(\\d{4})"
|
77
|
+
|
78
|
+
Sycsvpro::Merger.new(outfile: @outfile,
|
79
|
+
files: "#{@file1},#{@file2}",
|
80
|
+
header: header,
|
81
|
+
key: key,
|
82
|
+
source_header: source_header).execute
|
83
|
+
|
84
|
+
result = [ ";2010;2011;2012;2014",
|
85
|
+
"SP;20;30;40;60",
|
86
|
+
"RP;30;40;50;70",
|
87
|
+
"MP;40;50;60;80",
|
88
|
+
"NP;50;60;70;90",
|
89
|
+
"MI;m1;m2;m3",
|
90
|
+
"NI;n1;n2;n3",
|
91
|
+
"OI;o1;;o3", ]
|
92
|
+
|
93
|
+
rows = 0
|
94
|
+
|
95
|
+
File.open(@outfile).each_with_index do |row, index|
|
96
|
+
row.chomp.should eq result[index]
|
97
|
+
rows += 1
|
98
|
+
end
|
99
|
+
|
100
|
+
rows.should eq result.size
|
101
|
+
end
|
102
|
+
|
103
|
+
end
|
104
|
+
|
105
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sycsvpro
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
4
|
+
version: 0.1.10
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-
|
12
|
+
date: 2014-07-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -144,6 +144,7 @@ files:
|
|
144
144
|
- lib/sycsvpro/inserter.rb
|
145
145
|
- lib/sycsvpro/join.rb
|
146
146
|
- lib/sycsvpro/mapper.rb
|
147
|
+
- lib/sycsvpro/merger.rb
|
147
148
|
- lib/sycsvpro/profiler.rb
|
148
149
|
- lib/sycsvpro/row_filter.rb
|
149
150
|
- lib/sycsvpro/script_creator.rb
|
@@ -167,6 +168,7 @@ files:
|
|
167
168
|
- spec/sycsvpro/inserter_spec.rb
|
168
169
|
- spec/sycsvpro/join_spec.rb
|
169
170
|
- spec/sycsvpro/mapper_spec.rb
|
171
|
+
- spec/sycsvpro/merger_spec.rb
|
170
172
|
- spec/sycsvpro/profiler_spec.rb
|
171
173
|
- spec/sycsvpro/row_filter_spec.rb
|
172
174
|
- spec/sycsvpro/script_list_spec.rb
|