csvjoin 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/csvjoin +19 -0
- data/lib/callbacks.rb +33 -0
- data/lib/comparator.rb +134 -0
- data/lib/csvjoin/version.rb +5 -0
- data/lib/data_row.rb +44 -0
- metadata +148 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 556d47fbf1bc4d3cb0821ec85ea557bddda5b2087e12d96657ca422b4d05e200
|
4
|
+
data.tar.gz: b5a0aeccd15f5f100917f1506460540b37f7c5e3bad513d3323cab249353bb21
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cf7cd627f6ec3f8def14ed499d947bdace1115b275d77fc966259e812ac679108675a8a0b6f4cb3be02469d339d15f4ae072f9171635f3aa14aedb7f014e6f95
|
7
|
+
data.tar.gz: 548d2195ae23b5aac1bf4b6eeefec69c36ae33aa7fb1517d0c8c4628f2ec4828f30a3f5493857ed172c7aed7160da355495dad46f4b2ec22d6df2bc9f65d999d
|
data/bin/csvjoin
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

# Command-line entry point for csvjoin.
#
# Usage: csvjoin FILE1 FILE2 [ColumnA1=ColumnA2,ColumnB1=ColumnB2]
#
# Prints the joined table to standard output. The usage text is shown when
# fewer than two arguments are given or when -h / --help appears anywhere
# on the command line.

require_relative '../lib/comparator.rb'

if ARGV.length < 2 || ARGV.include?('-h') || ARGV.include?('--help')
  puts "Usage: csvjoin FILE1 FILE2 [ColumnA1=ColumnA2,ColumnB1=ColumnB2]"
  puts "Joins two CSV files looking for same values in specified columns. "
  puts "If no columns specified by default it will use columns with the same name in both files"
else
  comparator = CSVJoin::Comparator.new
  left_source = ARGV.shift
  right_source = ARGV.shift
  column_spec = ARGV.shift

  # The optional third argument selects which columns are matched.
  comparator.columns_to_compare(column_spec) if column_spec

  # Initial separator. Comparator#parse re-detects the separator from the
  # first line whenever the source is an existing file, so this value only
  # matters for sources that are not files.
  comparator.input_col_sep = ';'

  puts comparator.compare(left_source, right_source)
end
|
data/lib/callbacks.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# Callbacks for Diff::LCS with insert in left "<" + insert in right ">" instead of replace (!)
|
2
|
+
module Diff
  module LCS
    # Callback collector for Diff::LCS traversals that never emits a
    # "replace" entry: a change event is recorded as two separate entries,
    # one for the old (left) element and one for the new (right) element.
    class NoReplaceDiffCallbacks
      # The list of simplified change entries gathered so far.
      attr_reader :diffs

      # Starts with an empty list; yields the new collector when a block
      # is supplied.
      def initialize
        @diffs = []
        yield self if block_given?
      end

      # Records an event for elements present on both sides.
      def match(event)
        record(event)
      end

      # Records an event for an element present only on the left side.
      def discard_a(event)
        record(event)
      end

      # Records an event for an element present only on the right side.
      def discard_b(event)
        record(event)
      end

      # Splits a change event into a left-only entry followed by a
      # right-only entry instead of storing it as a single replacement.
      def change(event)
        left_only = Diff::LCS::ContextChange.new("<", event.old_position, event.old_element, nil, nil)
        discard_a(left_only)

        right_only = Diff::LCS::ContextChange.new(">", nil, nil, event.new_position, event.new_element)
        discard_b(right_only)
      end

      private

      # Appends the simplified form of +event+ to the collected diffs.
      def record(event)
        @diffs << Diff::LCS::ContextChange.simplify(event)
      end
    end
  end
end
|
data/lib/comparator.rb
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'data_row'
|
4
|
+
require_relative 'callbacks'
|
5
|
+
require 'diff/lcs'
|
6
|
+
|
7
|
+
|
8
|
+
module CSVJoin
  LEFT = 1 # 'left'
  RIGHT = 2 # 'right'

  # Compares two CSV tables on a set of key columns and joins them into a
  # single CSV whose rows are: left fields, a "diff" marker, right fields.
  class Comparator
    # Candidate column separators, in tie-breaking order (first one wins
    # when several occur equally often).
    SEPARATORS = [",", ";", "\t"].freeze

    # Diff action character => marker written to the "diff" column.
    ACTION_LABELS = { "!": "!==", "-": "==>", "+": "<==", "=": "===" }.freeze

    attr_accessor :columns, :weights
    attr_accessor :headers, :data, :rows
    attr_accessor :input_col_sep

    def initialize
      @data = []
      @rows = []
      @empty = []
      @input_col_sep = ","
    end

    # Guesses the column separator of +line+ by picking the candidate that
    # occurs most often. Returns "," when +line+ is nil.
    def intuit_col_sep(line)
      return "," if line.nil?

      SEPARATORS.max_by { |char| line.count(char) }
    end

    # Detects the separator from the first line of +file+ and stores it in
    # @input_col_sep. An empty file leaves the current separator untouched.
    # Returns +file+.
    def intuit_separator(file)
      # Block form guarantees the handle is closed once the line is read.
      File.open(file, encoding: 'bom|utf-8') do |io|
        first_line = io.gets
        @input_col_sep = intuit_col_sep(first_line) unless first_line.nil?
      end
      file
    end

    # Parses +data+, which is either a path to an existing file or a string
    # holding CSV text. For files the separator is detected first.
    #
    # Raises RuntimeError ("Wrong CSV") when a file yields no rows.
    def parse(data)
      if File.exist? data
        intuit_separator(data)
        # NOTE(review): separator detection opens the file with
        # 'bom|utf-8' but this read does not — confirm whether a BOM can
        # end up in the first header name.
        csv = CSV.read(data, headers: true, col_sep: @input_col_sep)
        raise "Wrong CSV" if csv == []
      else
        csv = CSV.parse(data, headers: true, col_sep: @input_col_sep)
      end
      csv
    end

    # Wraps every row of +csv+ in a DataRow configured with the key columns
    # of the given +side+ (first name of each pair for LEFT, last otherwise)
    # and the current weights. Returns the list of DataRow objects.
    def csv_to_talimer_rows(csv, side: 'undef')
      row_columns = columns.map { |pair| side.eql?(LEFT) ? pair.first : pair.last }

      csv.map do |row|
        data_row = DataRow.new(row.headers, row.fields)
        data_row.columns = row_columns
        data_row.weights = weights
        data_row.side = side
        data_row
      end
    end

    # Parses +source+ for +side+ and prepares the blank filler row (one
    # empty string per header) used when that side has no matching row.
    def parse_side(source, side: nil)
      @data[side] = parse(source)
      @empty[side] = Array.new(@data[side].headers.size, '')
    end

    # Builds the comparable rows for +side+ from the already parsed table.
    def prepare_rows(side: nil)
      @rows[side] = csv_to_talimer_rows(@data[side], side: side)
    end

    # Parses both sources, settles the key columns, builds the comparable
    # rows and the joined header line.
    def prepare(source1, source2)
      parse_side(source1, side: LEFT)
      parse_side(source2, side: RIGHT)

      set_default_column_names

      prepare_rows(side: LEFT)
      prepare_rows(side: RIGHT)

      @headers = [*@data[LEFT].headers, "diff", *@data[RIGHT].headers]
    end

    # By default use the columns that have the same name in both tables.
    # Does nothing when columns were already chosen.
    def set_default_column_names
      return unless @columns.nil?

      @columns = (@data[LEFT].headers & @data[RIGHT].headers).map { |name| [name, name] }
      @weights = Array.new(@columns.size, 1)
    end

    # Translates a diff action character into its "diff" column marker.
    #
    # Raises RuntimeError for an action that has no marker.
    def action_verbose(action)
      ACTION_LABELS.fetch(action.to_sym) { raise "wrong action #{action}" }
    end

    # Joins +source1+ and +source2+ and returns the result as CSV text,
    # using "\n" as row separator and the current input column separator.
    def compare(source1, source2)
      prepare(source1, source2)

      sdiff = Diff::LCS.sdiff(@rows[LEFT],
                              @rows[RIGHT],
                              Diff::LCS::NoReplaceDiffCallbacks)

      CSV.generate(row_sep: "\n", col_sep: @input_col_sep) do |csv|
        csv << @headers
        sdiff.each { |change| csv << joined_row(change) }
      end
    end

    # Builds one output row for +change+: left fields, marker, right fields.
    # A missing side is filled with that side's blank row.
    def joined_row(change)
      left_row = change.old_element.nil? ? @empty[LEFT] : change.old_element.fields
      right_row = change.new_element.nil? ? @empty[RIGHT] : change.new_element.fields
      [*left_row, action_verbose(change.action), *right_row]
    end

    # Sets the key columns from a spec such as "ColA1=ColA2,ColB1=ColB2"
    # ("~" is accepted in place of "="). The first pair gets weight 1, all
    # others weight 0.
    #
    # Raises ArgumentError when +cols+ contains no column pair; in that
    # case the previously configured columns and weights are left untouched.
    def columns_to_compare(cols)
      pairs = cols.scan(/([^,:=~]+)(?:[=~])([^,:=~]+)/)
      if pairs.empty?
        raise ArgumentError,
              "no column pairs found in #{cols.inspect}, expected ColumnA1=ColumnA2[,ColumnB1=ColumnB2]"
      end

      @columns = pairs
      @weights = [1, *[0] * (@columns.size - 1)]
    end
  end
end
|
data/lib/data_row.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'csv'
|
4
|
+
|
5
|
+
module CSVJoin
  # CSV::Row whose equality and hash are based only on a configured list of
  # key columns, so rows from two different tables can be matched even when
  # their remaining fields (or their key column names) differ.
  class DataRow < CSV::Row
    # columns: names of the key columns of this row, in comparison order.
    # weights: one entry per key column; only its length is used here.
    # side:    marker of the table this row came from.
    attr_accessor :columns, :weights, :side

    # Prefixes the regular row inspection with the side marker.
    def inspect
      "#{side}:#{super}"
    end

    # Same key-column comparison as #==, so rows can be used as hash keys.
    def eql?(other)
      self == other
    end

    # Hash of the key-column values, one value per weights entry.
    # Emits a warning for every key column whose value is nil.
    def hash
      key_values = @weights.each_index.map do |index|
        field = @columns[index]
        value = self[field]
        warn("something wrong, #{inspect}, side #{side.inspect}, f'#{field}'==nil") if value.nil?
        value
      end
      key_values.hash
    end

    #
    # Returns +true+ if every key column of this row holds the same value
    # as the key column at the same position in +other+. Column names may
    # differ between the two rows; fields outside the key columns are
    # ignored. Returns +false+ when +other+ has no key columns to compare
    # against (for example +nil+ or a plain CSV::Row).
    #
    def ==(other)
      return false unless other.respond_to?(:columns) && other.columns

      @columns.each_with_index do |from, index|
        to = other.columns[index]
        return false unless self[from].eql?(other[to])
      end
      true
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,148 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: csvjoin
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.6.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Sergey Evstegneiev
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-06-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: diff-lcs
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 13.0.0
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 13.0.0
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '3.5'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '3.5'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec-simplecov
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 0.2.2
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.2.2
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.49.1
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.49.1
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: simplecov
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 0.16.1
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 0.16.1
|
111
|
+
description: tool to align and merge two tables containing different parts of the
|
112
|
+
same data
|
113
|
+
email:
|
114
|
+
- serg123e+github@gmail.com
|
115
|
+
executables:
|
116
|
+
- csvjoin
|
117
|
+
extensions: []
|
118
|
+
extra_rdoc_files: []
|
119
|
+
files:
|
120
|
+
- bin/csvjoin
|
121
|
+
- lib/callbacks.rb
|
122
|
+
- lib/comparator.rb
|
123
|
+
- lib/csvjoin/version.rb
|
124
|
+
- lib/data_row.rb
|
125
|
+
homepage: https://www.github.com/serg123e/csvjoin
|
126
|
+
licenses:
|
127
|
+
- MIT
|
128
|
+
metadata: {}
|
129
|
+
post_install_message:
|
130
|
+
rdoc_options: []
|
131
|
+
require_paths:
|
132
|
+
- lib
|
133
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - "~>"
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: '2.0'
|
138
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
|
+
requirements:
|
140
|
+
- - ">="
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: '0'
|
143
|
+
requirements: []
|
144
|
+
rubygems_version: 3.0.3
|
145
|
+
signing_key:
|
146
|
+
specification_version: 4
|
147
|
+
summary: Join 2 CSV tables by specified columns
|
148
|
+
test_files: []
|