group_by_match_type 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +65 -0
- data/exe/group_by_match_type +6 -0
- data/lib/group_by_match_type/cli.rb +57 -0
- data/lib/group_by_match_type/matcher.rb +83 -0
- data/lib/group_by_match_type/union_find.rb +27 -0
- data/lib/group_by_match_type/version.rb +5 -0
- data/lib/group_by_match_type.rb +10 -0
- metadata +126 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5202086856b91a26669f161ea8edc6982fd6dd8fc6e62e375e4bd334727de16c
|
4
|
+
data.tar.gz: 4b3c5911103ed26e80ffb6ddff7bc77dd3c9b83d7ba19ab0a787c652def1f407
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f5167852ae2269c04129ad5a859af81a2e2addaf90db7db1ca72cbf417df91dac9327f263de05e9a8151e9be8dbd587551693b69875558caf8dec4cdb16ffa30
|
7
|
+
data.tar.gz: 27809eb8566ecc18b886fa35fa300725323db17bb51bda0153c72c99df36ba4acb675a644213a1f756421d95da98700774d67f2855501b1ecf173a00871b1fa5
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2025 Justin Boltz
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
# Group By Match Type
|
2
|
+
|
3
|
+
A Ruby gem for identifying and grouping CSV records based on matching email or phone number columns.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Install the gem by running:
|
8
|
+
|
9
|
+
```bash
|
10
|
+
gem install group_by_match_type
|
11
|
+
```
|
12
|
+
|
13
|
+
Or add this line to your application's Gemfile:
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
gem 'group_by_match_type'
|
17
|
+
```
|
18
|
+
|
19
|
+
And then execute:
|
20
|
+
|
21
|
+
```bash
|
22
|
+
bundle install
|
23
|
+
```
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
```bash
|
28
|
+
group_by_match_type INPUT_FILE MATCHING_TYPE [OUTPUT_FILE]
|
29
|
+
```
|
30
|
+
|
31
|
+
Available matching types:
|
32
|
+
- `same_email`: Groups records with matching email addresses
|
33
|
+
- `same_phone`: Groups records with matching phone numbers
|
34
|
+
- `same_email_or_phone`: Groups records that share either email or phone number
|
35
|
+
|
36
|
+
### Examples:
|
37
|
+
```bash
|
38
|
+
# Match by email, default output
|
39
|
+
group_by_match_type contacts.csv same_email
|
40
|
+
|
41
|
+
# Match by phone, default output
|
42
|
+
group_by_match_type contacts.csv same_phone
|
43
|
+
|
44
|
+
# Match by either email or phone, default output
|
45
|
+
group_by_match_type contacts.csv same_email_or_phone
|
46
|
+
|
47
|
+
# Specify a custom output file location
|
48
|
+
group_by_match_type contacts.csv same_email ~/Downloads/my_grouped_contacts.csv
|
49
|
+
```
|
50
|
+
|
51
|
+
## Output
|
52
|
+
|
53
|
+
The gem creates a new CSV file containing all of the original columns plus a new `group_id` column at the end. Records that are considered to be the same person under the chosen `MATCHING_TYPE` share the same `group_id`. If you specify an `OUTPUT_FILE`, the grouped CSV is written to that location; otherwise it is written next to your input file, with `_grouped.csv` replacing the `.csv` extension (for example, `contacts.csv` becomes `contacts_grouped.csv`).
|
54
|
+
|
55
|
+
## Development
|
56
|
+
|
57
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
58
|
+
|
59
|
+
## Contributing
|
60
|
+
|
61
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/justinboltz/group_by_match_type. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](CODE_OF_CONDUCT.md).
|
62
|
+
|
63
|
+
## License
|
64
|
+
|
65
|
+
The gem is available as open source under the terms of the [MIT License](LICENSE.txt).
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require_relative 'matcher'
|
3
|
+
|
4
|
+
module GroupByMatchType
  # Command-line front end for the gem.
  #
  # Validates the arguments, hands the work to Matcher, and reports the
  # result. Normal output (help requested with -h/--help, the success
  # message) goes to stdout; error messages and the usage text that
  # accompanies them go to stderr so they never mix with regular output.
  class CLI
    # Matching strategies accepted as the MATCHING_TYPE argument.
    VALID_TYPES = %w[same_email same_phone same_email_or_phone].freeze

    # Runs the command-line interface.
    #
    # Terminates the process via +exit+ (status 0 for an explicit help
    # request, status 1 for any usage or processing error). Returns normally
    # only after a successful run.
    #
    # @param args [Array<String>] INPUT_FILE, MATCHING_TYPE and an optional
    #   OUTPUT_FILE, typically ARGV
    # @return [void]
    def self.start(args)
      # No arguments at all is a usage error: show the help, but on stderr.
      if args.empty?
        show_help($stderr)
        exit 1
      end

      if args.include?('-h') || args.include?('--help')
        show_help
        exit 0
      end

      unless args.length.between?(2, 3)
        fail_with('Error: Please provide the input file, matching type, and optionally an output file path',
                  help: true)
      end

      input_file, matching_type, output_file = args

      unless VALID_TYPES.include?(matching_type)
        fail_with("Error: Invalid matching type '#{matching_type}'", help: true)
      end

      fail_with("Error: File '#{input_file}' does not exist") unless File.exist?(input_file)

      begin
        output = Matcher.new(input_file, matching_type).process(output_file)
        puts "Processing complete. Output written to: #{output}"
      rescue StandardError => e
        # exit raises SystemExit, which is not a StandardError, so the exits
        # above are never swallowed by this handler.
        fail_with("Error: #{e.message}")
      end
    end

    # Prints the usage text.
    #
    # @param io [IO] stream to write to; defaults to stdout
    # @return [void]
    def self.show_help(io = $stdout)
      io.puts <<~HELP_MESSAGE
        Usage: group_by_match_type INPUT_FILE MATCHING_TYPE [OUTPUT_FILE]

        Arguments:
          INPUT_FILE      Path to the CSV file to process
          MATCHING_TYPE   One of: same_email, same_phone, same_email_or_phone
          OUTPUT_FILE     (Optional) Path to save the grouped CSV output

        Example:
          group_by_match_type contacts.csv same_email ~/Downloads/contacts_grouped.csv
      HELP_MESSAGE
    end

    # Writes +message+ (and optionally the usage text) to stderr, then exits
    # with status 1. $stderr.puts is used instead of Kernel#warn so the
    # message is still shown when Ruby warnings are disabled.
    def self.fail_with(message, help: false)
      $stderr.puts message
      show_help($stderr) if help
      exit 1
    end
    private_class_method :fail_with
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require_relative 'union_find'
|
3
|
+
|
4
|
+
module GroupByMatchType
  # Groups the rows of a CSV file that share an email address and/or a phone
  # number and writes a copy of the file with an extra "group_id" column.
  #
  # Columns are selected by header name: every header containing "email"
  # (case-insensitive) is treated as an email column and every header
  # containing "phone" as a phone column. Emails are compared
  # case-insensitively; phones are compared by their digits, with a leading
  # "1" dropped from 11-digit numbers. Blank cells never match anything.
  class Matcher
    EMAIL_HEADER = /email/i.freeze
    PHONE_HEADER = /phone/i.freeze

    # @param input_file [String] path of the CSV file to read (with a header row)
    # @param match_type [String] "same_email", "same_phone" or
    #   "same_email_or_phone"
    def initialize(input_file, match_type)
      @input_file = input_file
      @match_type = match_type
      @union_find = UnionFind.new
    end

    # Reads the input file, groups its rows and writes the grouped CSV.
    #
    # Group ids are numbered from 1 in order of first appearance.
    #
    # @param output_file [String, nil] destination path; when nil the file is
    #   written next to the input with a "_grouped.csv" suffix
    # @return [String] the path that was written
    # @raise [ArgumentError] if the match type is unknown (and the file has rows)
    def process(output_file = nil)
      rows = CSV.read(@input_file, headers: true)

      # Link every row to each of its match keys; rows sharing a key end up
      # in the same set.
      build_match_keys(rows).each_with_index do |keys, index|
        @union_find.find_or_create(keys + [row_key(index)])
      end

      group_id_map = {}
      group_ids = rows.each_with_index.map do |_row, index|
        root = @union_find.find(row_key(index))
        group_id_map[root] ||= group_id_map.size + 1
      end

      output_file ||= default_output_path
      CSV.open(output_file, 'w') do |csv|
        csv << rows.headers + ['group_id']
        rows.each_with_index { |row, index| csv << row.fields + [group_ids[index]] }
      end

      output_file
    end

    private

    # Key under which a row itself is registered. The prefix keeps it from
    # colliding with a data value that happens to equal the row index.
    def row_key(index)
      "row:#{index}"
    end

    # Default destination: "<name>_grouped.csv". A ".csv" extension is matched
    # case-insensitively, and a name without one gets the suffix appended, so
    # the result can never be the input path itself.
    def default_output_path
      if @input_file.match?(/\.csv\z/i)
        @input_file.sub(/\.csv\z/i, '_grouped.csv')
      else
        "#{@input_file}_grouped.csv"
      end
    end

    # Returns, for each row, the list of namespaced keys it can be matched on.
    def build_match_keys(rows)
      rows.map do |row|
        case @match_type
        when 'same_email'
          email_keys(row)
        when 'same_phone'
          phone_keys(row)
        when 'same_email_or_phone'
          (email_keys(row) + phone_keys(row)).sort
        else
          raise ArgumentError, "Unknown match type: #{@match_type}"
        end
      end
    end

    # Lower-cased, de-duplicated email keys of a row.
    def email_keys(row)
      extract_emails(row).map { |email| "email:#{email.downcase}" }.uniq.sort
    end

    # Normalized, de-duplicated phone keys of a row; values without any digit
    # are ignored.
    def phone_keys(row)
      extract_phones(row)
        .map { |phone| sanitize_phone(phone) }
        .compact
        .map { |digits| "phone:#{digits}" }
        .uniq
        .sort
    end

    # Non-blank, stripped values of all email columns.
    def extract_emails(row)
      column_values(row, EMAIL_HEADER)
    end

    # Non-blank, stripped values of all phone columns.
    def extract_phones(row)
      column_values(row, PHONE_HEADER)
    end

    # Collects the stripped, non-empty values of every column whose header
    # matches +header_pattern+. Iterating the (header, value) pairs rather
    # than looking values up by header name also covers duplicate headers.
    def column_values(row, header_pattern)
      row.each_with_object([]) do |(header, value), values|
        next unless header_pattern.match?(header.to_s)

        cleaned = value&.strip
        values << cleaned unless cleaned.nil? || cleaned.empty?
      end
    end

    # Reduces a phone number to its digits, dropping the leading "1" of an
    # 11-digit number. Returns nil when there is nothing to compare.
    def sanitize_phone(phone)
      return nil if phone.nil?

      digits = phone.gsub(/\D/, '')
      return nil if digits.empty?

      digits.length == 11 && digits.start_with?('1') ? digits[1..] : digits
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module GroupByMatchType
  # Disjoint-set (union-find) structure keyed by arbitrary hash keys.
  #
  # Items are registered lazily: the first time an item is passed to #find it
  # becomes the representative of its own single-element set.
  class UnionFind
    def initialize
      # Maps each known item to its parent; a representative maps to itself.
      @parent = {}
    end

    # Returns the representative of the set containing +x+, registering +x+
    # as a new single-element set if it has not been seen before.
    #
    # Implemented iteratively so that long parent chains cannot overflow the
    # call stack; every visited item is re-pointed directly at the
    # representative (path compression).
    #
    # @param x [Object] the item to look up
    # @return [Object] the representative of x's set
    def find(x)
      @parent[x] = x unless @parent.key?(x)

      root = x
      root = @parent[root] until @parent[root] == root

      node = x
      until node == root
        next_node = @parent[node]
        @parent[node] = root
        node = next_node
      end

      root
    end

    # Merges the sets containing +x+ and +y+; the representative of y's set
    # becomes the representative of the merged set.
    #
    # @return [Object, nil] the new representative, or nil if both items were
    #   already in the same set
    def union(x, y)
      root_x = find(x)
      root_y = find(y)
      @parent[root_x] = root_y unless root_x == root_y
    end

    # Puts all +items+ (compared by their string form) into one set, creating
    # entries for unseen items. The argument itself is not modified.
    #
    # @param items [Array<#to_s>] items that belong together
    # @return [String, nil] the representative of the merged set, or nil when
    #   +items+ is empty
    def find_or_create(items)
      return nil if items.empty?

      first, *rest = items.map(&:to_s)
      find(first) # registers a lone item even when there is nothing to merge
      rest.each { |item| union(first, item) }
      find(first)
    end
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'group_by_match_type/version'
|
4
|
+
require_relative 'group_by_match_type/matcher'
|
5
|
+
require_relative 'group_by_match_type/cli'
|
6
|
+
|
7
|
+
# Top-level namespace of the group_by_match_type gem. The files required
# above load the version, the Matcher and the CLI into this namespace.
module GroupByMatchType
|
8
|
+
# Gem-specific error class deriving from StandardError. Nothing in this file
# raises it.
class Error < StandardError; end
|
9
|
+
# No further definitions live in this file.
|
10
|
+
end
|
metadata
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: group_by_match_type
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Justin Boltz
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2025-04-19 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: csv
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '13.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '13.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '3.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '3.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rubocop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.21'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.21'
|
83
|
+
description: This gem processes CSV files and groups records that might represent
|
84
|
+
the same person based on matching email addresses, phone numbers, or both
|
85
|
+
email:
|
86
|
+
- boltz.justin@gmail.com
|
87
|
+
executables:
|
88
|
+
- group_by_match_type
|
89
|
+
extensions: []
|
90
|
+
extra_rdoc_files: []
|
91
|
+
files:
|
92
|
+
- LICENSE.txt
|
93
|
+
- README.md
|
94
|
+
- exe/group_by_match_type
|
95
|
+
- lib/group_by_match_type.rb
|
96
|
+
- lib/group_by_match_type/cli.rb
|
97
|
+
- lib/group_by_match_type/matcher.rb
|
98
|
+
- lib/group_by_match_type/union_find.rb
|
99
|
+
- lib/group_by_match_type/version.rb
|
100
|
+
homepage: https://github.com/justinboltz/group_by_match_type
|
101
|
+
licenses:
|
102
|
+
- MIT
|
103
|
+
metadata:
|
104
|
+
homepage_uri: https://github.com/justinboltz/group_by_match_type
|
105
|
+
source_code_uri: https://github.com/justinboltz/group_by_match_type
|
106
|
+
changelog_uri: https://github.com/justinboltz/group_by_match_type/blob/master/CHANGELOG.md
|
107
|
+
post_install_message:
|
108
|
+
rdoc_options: []
|
109
|
+
require_paths:
|
110
|
+
- lib
|
111
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: 2.6.0
|
116
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
|
+
requirements:
|
118
|
+
- - ">="
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
121
|
+
requirements: []
|
122
|
+
rubygems_version: 3.5.3
|
123
|
+
signing_key:
|
124
|
+
specification_version: 4
|
125
|
+
summary: A gem for matching and grouping CSV records based on different criteria
|
126
|
+
test_files: []
|