csvjoin 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 556d47fbf1bc4d3cb0821ec85ea557bddda5b2087e12d96657ca422b4d05e200
4
+ data.tar.gz: b5a0aeccd15f5f100917f1506460540b37f7c5e3bad513d3323cab249353bb21
5
+ SHA512:
6
+ metadata.gz: cf7cd627f6ec3f8def14ed499d947bdace1115b275d77fc966259e812ac679108675a8a0b6f4cb3be02469d339d15f4ae072f9171635f3aa14aedb7f014e6f95
7
+ data.tar.gz: 548d2195ae23b5aac1bf4b6eeefec69c36ae33aa7fb1517d0c8c4628f2ec4828f30a3f5493857ed172c7aed7160da355495dad46f4b2ec22d6df2bc9f65d999d
@@ -0,0 +1,19 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Command-line entry point for csvjoin.
#
# Usage: csvjoin FILE1 FILE2 [ColumnA1=ColumnA2,ColumnB1=ColumnB2]
#
# Prints the joined table to standard output. Help is shown when fewer than
# two arguments are given or when -h / --help appears anywhere in ARGV.

require_relative '../lib/comparator.rb'

if ARGV.length < 2 || ARGV.include?("-h") || ARGV.include?("--help")
  puts "Usage: csvjoin FILE1 FILE2 [ColumnA1=ColumnA2,ColumnB1=ColumnB2]"
  puts "Joins two CSV files looking for same values in specified columns. "
  puts "If no columns specified by default it will use columns with the same name in both files"
else
  # Positional arguments: left source, right source, optional column mapping.
  left_source, right_source, column_spec = ARGV.shift(3)

  comparator = CSVJoin::Comparator.new
  comparator.columns_to_compare(column_spec) if column_spec

  # Separator used when a source is not an existing file; for existing files
  # Comparator#parse re-detects the separator from the file's first line.
  comparator.input_col_sep = ";"

  puts comparator.compare(left_source, right_source)
end
@@ -0,0 +1,33 @@
1
# Callbacks for Diff::LCS that never record a replacement ("!"): a changed
# element is recorded as a left-side event ("<") followed by a right-side
# event (">").
module Diff
  module LCS
    # Collects the events reported during a diff traversal into a flat list.
    class NoReplaceDiffCallbacks
      # The events collected so far, in the order they were reported.
      attr_reader :diffs

      # Starts with an empty event list.
      #
      # :yields: self
      def initialize
        @diffs = []
        yield self if block_given?
      end

      # Records an event for an element present on both sides.
      def match(event)
        record(event)
      end

      # Records an event for an element present only in the old (left) sequence.
      def discard_a(event)
        record(event)
      end

      # Records an event for an element present only in the new (right) sequence.
      def discard_b(event)
        record(event)
      end

      # Splits a change into two events instead of a single replacement:
      # first the old element as a left-side event, then the new element as a
      # right-side event.
      def change(event)
        left_only = Diff::LCS::ContextChange.new("<", event.old_position, event.old_element, nil, nil)
        discard_a(left_only)

        right_only = Diff::LCS::ContextChange.new(">", nil, nil, event.new_position, event.new_element)
        discard_b(right_only)
      end

      private

      # Appends the simplified form of +event+ to the collected list and
      # returns that list.
      def record(event)
        @diffs << Diff::LCS::ContextChange.simplify(event)
      end
    end
  end
end
@@ -0,0 +1,134 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'data_row'
4
+ require_relative 'callbacks'
5
+ require 'diff/lcs'
6
+
7
+
8
module CSVJoin
  LEFT = 1 # slot of the first (left) table in @data, @rows and @empty
  RIGHT = 2 # slot of the second (right) table

  # Compares two CSV tables row by row on a set of key columns and joins them
  # into a single CSV: left fields, a "diff" marker column, right fields.
  class Comparator
    # Separators tried when guessing; on a tie the earliest entry wins.
    SEPARATOR_CANDIDATES = [",", ";", "\t"].freeze

    # Text written to the "diff" column for each diff action.
    ACTION_MARKERS = { "!": "!==", "-": "==>", "+": "<==", "=": "===" }.freeze

    # columns: list of [left_header, right_header] pairs used as the join key.
    # weights: one number per key column; handed to every DataRow.
    attr_accessor :columns, :weights
    # headers: header row of the joined output; data/rows: per-side tables.
    attr_accessor :headers, :data, :rows
    # Column separator used for parsing input and for the generated output.
    attr_accessor :input_col_sep

    def initialize
      @data = []
      @rows = []
      @empty = []
      @input_col_sep = ","
    end

    # Guesses the column separator of +line+ by picking the candidate that
    # occurs most often. Returns "," when +line+ is nil.
    def intuit_col_sep(line)
      return "," if line.nil?

      SEPARATOR_CANDIDATES.max_by { |char| line.count(char) }
    end

    # Sets @input_col_sep from the first line of +file+ (a leading UTF-8 BOM
    # is skipped). An empty file leaves the separator unchanged.
    # Returns +file+.
    def intuit_separator(file)
      # Block form so the handle is closed as soon as the first line is read.
      first_line = File.open(file, encoding: 'bom|utf-8', &:gets)
      @input_col_sep = intuit_col_sep(first_line) unless first_line.nil?
      file
    end

    # Parses +data+ into a CSV table with headers.
    #
    # When +data+ names an existing file, the separator is detected from the
    # file first; otherwise +data+ itself is parsed as CSV text using the
    # current @input_col_sep.
    #
    # Raises RuntimeError ("Wrong CSV") when the file yields no data rows.
    def parse(data)
      return CSV.parse(data, headers: true, col_sep: @input_col_sep) unless File.exist?(data)

      intuit_separator(data)
      # Read with the same BOM-aware encoding used for separator detection so
      # a BOM does not end up glued to the first header name.
      csv = CSV.read(data, headers: true, col_sep: @input_col_sep, encoding: 'bom|utf-8')
      raise "Wrong CSV" if csv.empty?

      csv
    end

    # Wraps every row of +csv+ in a DataRow configured with the key columns
    # of the given +side+ (first element of each pair for LEFT, last
    # element otherwise), the shared weights and the side itself.
    def csv_to_talimer_rows(csv, side: 'undef')
      picker = side.eql?(LEFT) ? :first : :last
      key_columns = columns.map(&picker)

      csv.map do |row|
        DataRow.new(row.headers, row.fields).tap do |data_row|
          data_row.columns = key_columns
          data_row.weights = weights
          data_row.side = side
        end
      end
    end

    # Parses +source+ into the slot for +side+ and prepares the blank filler
    # row (one empty string per header) used when that side has no match.
    def parse_side(source, side: nil)
      @data[side] = parse(source)
      @empty[side] = Array.new(@data[side].headers.size, '')
    end

    # Builds the comparable rows for +side+ from its parsed table.
    def prepare_rows(side: nil)
      @rows[side] = csv_to_talimer_rows(@data[side], side: side)
    end

    # Parses both sources, settles the key columns, builds comparable rows
    # and the header row of the joined output.
    def prepare(source1, source2)
      parse_side(source1, side: LEFT)
      parse_side(source2, side: RIGHT)

      # Needs both header lists, and must run before rows are built because
      # every row captures the key columns.
      set_default_column_names

      prepare_rows(side: LEFT)
      prepare_rows(side: RIGHT)

      @headers = [*@data[LEFT].headers, "diff", *@data[RIGHT].headers]
    end

    # By default use columns with the same name in both tables, each with
    # weight 1. Does nothing when columns were already chosen.
    def set_default_column_names
      return unless @columns.nil?

      shared = @data[LEFT].headers & @data[RIGHT].headers
      @columns = shared.map { |name| [name, name] }
      @weights = Array.new(@columns.size, 1)
    end

    # Returns the marker text for a diff +action+ ("!", "-", "+" or "=").
    # Raises RuntimeError for any other action.
    def action_verbose(action)
      ACTION_MARKERS.fetch(action.to_sym) { raise "wrong action #{action}" }
    end

    # Joins +source1+ and +source2+ and returns the result as CSV text, using
    # "\n" as row separator and @input_col_sep as column separator.
    def compare(source1, source2)
      prepare(source1, source2)

      sdiff = Diff::LCS.sdiff(@rows[LEFT],
                              @rows[RIGHT],
                              Diff::LCS::NoReplaceDiffCallbacks)

      CSV.generate(row_sep: "\n", col_sep: @input_col_sep) do |csv|
        csv << @headers
        sdiff.each { |change| csv << joined_row(change) }
      end
    end

    # Builds one output row for +change+: left fields, marker, right fields.
    # A side with no element is filled with that side's blank row.
    def joined_row(change)
      left_row = change.old_element.nil? ? @empty[LEFT] : change.old_element.fields
      right_row = change.new_element.nil? ? @empty[RIGHT] : change.new_element.fields
      [*left_row, action_verbose(change.action), *right_row]
    end

    # Sets the key columns from a spec such as "id=ID,name~Name": a
    # comma-separated list of LEFT=RIGHT (or LEFT~RIGHT) header pairs.
    # The first pair gets weight 1, the remaining pairs weight 0.
    #
    # Raises ArgumentError when +cols+ contains no such pair.
    def columns_to_compare(cols)
      pairs = cols.scan(/([^,:=~]+)(?:[=~])([^,:=~]+)/)
      if pairs.empty?
        raise ArgumentError,
              "no column pairs found in #{cols.inspect}; expected ColumnA1=ColumnA2[,ColumnB1=ColumnB2]"
      end

      @columns = pairs
      @weights = [1, *Array.new(pairs.size - 1, 0)]
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Namespace of the csvjoin gem.
module CSVJoin
  # Version string of this release.
  VERSION = '0.6.1'
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+
5
module CSVJoin
  # A CSV::Row that is compared and hashed by its key columns only.
  #
  # +columns+ lists the headers of this row that form the key; +weights+
  # holds one entry per key column; +side+ is a label for the table the row
  # came from (it only shows up in #inspect and in warnings).
  class DataRow < CSV::Row
    attr_accessor :columns, :weights, :side

    # The inherited inspect text prefixed with "<side>:".
    def inspect
      "#{side}:#{super}"
    end

    # Same as #==, so rows with equal key values are interchangeable as
    # hash keys (see #hash).
    def eql?(other)
      self == other
    end

    # Hash of the key-column values, one value per entry in +weights+.
    # Writes a warning to stderr for every key column whose value is nil.
    def hash
      key_values = @weights.each_index.map do |index|
        field = @columns[index]
        warn("something wrong, #{inspect}, side #{side.inspect}, f'#{field}'==nil") if self[field].nil?
        self[field]
      end
      key_values.hash
    end

    #
    # Returns +true+ if every key column of this row holds the same value
    # (by eql?) as the key column at the same position in +other+. Headers
    # and non-key fields are not compared. Returns +false+ when +other+ has
    # no key columns to compare against (e.g. nil or a plain CSV::Row).
    #
    def ==(other)
      return false unless other.respond_to?(:columns) && other.columns

      @columns.each_with_index.all? do |from, index|
        to = other.columns[index]
        self[from].eql?(other[to])
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,148 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: csvjoin
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.6.1
5
+ platform: ruby
6
+ authors:
7
+ - Sergey Evstegneiev
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-06-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: diff-lcs
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 13.0.0
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 13.0.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.5'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.5'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec-simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 0.2.2
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.2.2
83
+ - !ruby/object:Gem::Dependency
84
+ name: rubocop
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.49.1
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.49.1
97
+ - !ruby/object:Gem::Dependency
98
+ name: simplecov
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 0.16.1
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 0.16.1
111
+ description: tool to align and merge two tables containing different parts of the
112
+ same data
113
+ email:
114
+ - serg123e+github@gmail.com
115
+ executables:
116
+ - csvjoin
117
+ extensions: []
118
+ extra_rdoc_files: []
119
+ files:
120
+ - bin/csvjoin
121
+ - lib/callbacks.rb
122
+ - lib/comparator.rb
123
+ - lib/csvjoin/version.rb
124
+ - lib/data_row.rb
125
+ homepage: https://www.github.com/serg123e/csvjoin
126
+ licenses:
127
+ - MIT
128
+ metadata: {}
129
+ post_install_message:
130
+ rdoc_options: []
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - "~>"
136
+ - !ruby/object:Gem::Version
137
+ version: '2.0'
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ requirements: []
144
+ rubygems_version: 3.0.3
145
+ signing_key:
146
+ specification_version: 4
147
+ summary: Join 2 CSV tables by specified columns
148
+ test_files: []