comma_splice 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: e0a958c45c6fe94fbc6a4af1cb5fc2a0c20cf7a68728520d227b62a79e2078a5
4
+ data.tar.gz: 7e4e09b4a3308a2ccb2c5a58d1b56ced9f6f5927bf41d89680c4401b5aad2b5d
5
+ SHA512:
6
+ metadata.gz: 1daff9cfdd5f54c37bd47edc83b02308a0f3fe7b0d8af86dd0a910e264929a5b5b1191386b3b06df42f157770c4bc733c656e3beb4d9bafe10d7bc65a5483185
7
+ data.tar.gz: f6e4c32e2e03d3bf915a9b8360146eaa34ed9b34a954cb4b9a74b08c68ef4f8ae15cba83c854f96b88b098b791f802200992e46194a373e27cdd388c40311136
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ .DS_Store
10
+ .byebug_history
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in comma_splice.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,52 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ comma_splice (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ activesupport (5.2.3)
10
+ concurrent-ruby (~> 1.0, >= 1.0.2)
11
+ i18n (>= 0.7, < 2)
12
+ minitest (~> 5.1)
13
+ tzinfo (~> 1.1)
14
+ byebug (11.0.1)
15
+ concurrent-ruby (1.1.5)
16
+ diff-lcs (1.3)
17
+ i18n (1.6.0)
18
+ concurrent-ruby (~> 1.0)
19
+ minitest (5.11.3)
20
+ rake (10.5.0)
21
+ rspec (3.8.0)
22
+ rspec-core (~> 3.8.0)
23
+ rspec-expectations (~> 3.8.0)
24
+ rspec-mocks (~> 3.8.0)
25
+ rspec-core (3.8.2)
26
+ rspec-support (~> 3.8.0)
27
+ rspec-expectations (3.8.4)
28
+ diff-lcs (>= 1.2.0, < 2.0)
29
+ rspec-support (~> 3.8.0)
30
+ rspec-mocks (3.8.1)
31
+ diff-lcs (>= 1.2.0, < 2.0)
32
+ rspec-support (~> 3.8.0)
33
+ rspec-support (3.8.2)
34
+ thor (0.20.3)
35
+ thread_safe (0.3.6)
36
+ tzinfo (1.2.5)
37
+ thread_safe (~> 0.1)
38
+
39
+ PLATFORMS
40
+ ruby
41
+
42
+ DEPENDENCIES
43
+ activesupport
44
+ bundler (~> 2.0)
45
+ byebug
46
+ comma_splice!
47
+ rake (~> 10.0)
48
+ rspec
49
+ thor
50
+
51
+ BUNDLED WITH
52
+ 2.0.1
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 Jeff Keen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,139 @@
1
+ # Comma Splice
2
+
3
+ This gem tackles one very specific problem: CSVs whose values contain commas but haven't been quoted. It determines which commas separate fields and which commas are part of a value, and corrects the file.
4
+
5
+ For example, given the following CSV
6
+
7
+ ```
8
+ timestamp,artist,title,albumtitle,label
9
+ 01-27-2019 @ 12:34:00,Lester Sterling, Lynn Taitt & The Jets,Check Point Charlie,Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968,Dub Store,
10
+ 01-27-2019 @ 12:31:00,Lester Sterling,Lester Sterling Special,Merritone Rock Steady 2: This Music Got Soul 1966-1967,Dub Store,
11
+
12
+ ```
13
+
14
+ which parses incorrectly as:
15
+
16
+ | timestamp | artist | title | albumtitle | label |
17
+ |-----------------------|-----------------|-------------|-----------------|------------------------------------------------------------|
18
+ | 01-27-2019 @ 12:34:00 | Lester Sterling | Lynn Taitt & The Jets | Check Point Charlie | Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968 |
19
+ | 01-27-2019 @ 12:31:00 | Lester Sterling | Lester Sterling Special | Merritone Rock Steady 2: This Music Got Soul 1966-1967 | Dub Store |
20
+
21
+
22
+ Running this through `comma_splice correct /path/to/file` will return this corrected content:
23
+
24
+ ```
25
+ timestamp,artist,title,albumtitle,label
26
+ 01-27-2019 @ 12:34:00,"Lester Sterling, Lynn Taitt & The Jets",Check Point Charlie,Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968,Dub Store,
27
+ 01-27-2019 @ 12:31:00,Lester Sterling,Lester Sterling Special,Merritone Rock Steady 2: This Music Got Soul 1966-1967,Dub Store,
28
+ ```
29
+
30
+ | timestamp | artist | title | albumtitle | label |
31
+ |-----------------------|-----------------|-------------|-----------------|------------------------------------------------------------|
32
+ | 01-27-2019 @ 12:34:00 | Lester Sterling, Lynn Taitt & The Jets | Check Point Charlie | Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968 | Dub Store |
33
+ | 01-27-2019 @ 12:31:00 | Lester Sterling | Lester Sterling Special | Merritone Rock Steady 2: This Music Got Soul 1966-1967 | Dub Store |
34
+
35
+
36
+ If it can't determine where the comma should go, it prompts you to choose among the possible options.
37
+
38
+
39
+ Given the following CSV:
40
+
41
+ ```
42
+ playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
43
+ 16851097,,,12-09-2017 @ 09:57:00,10,000 Maniacs and Michael Stipe,To Sir with Love,Campfire Songs,Rhino,post,live,y,
44
+ 16851096,,,12-09-2017 @ 09:44:00,Fran Jeffries,Mine Eyes,Fran Can Really Hang You Up the Most,Warwick,post,live,y,
45
+ ```
46
+
47
+ It prompts:
48
+
49
+ ```
50
+ Which one of these is correct?
51
+
52
+ (1) artist : 10
53
+ title : 000 Maniacs and Michael Stipe
54
+ albumtitle: To Sir with Love
55
+ label : "Campfire Songs,Rhino"
56
+
57
+ (2) artist : 10
58
+ title : 000 Maniacs and Michael Stipe
59
+ albumtitle: "To Sir with Love,Campfire Songs"
60
+ label : Rhino
61
+
62
+ (3) artist : 10
63
+ title : "000 Maniacs and Michael Stipe,To Sir with Love"
64
+ albumtitle: Campfire Songs
65
+ label : Rhino
66
+
67
+ (4) artist : "10,000 Maniacs and Michael Stipe"
68
+ title : To Sir with Love
69
+ albumtitle: Campfire Songs
70
+ label : Rhino
71
+ ```
72
+
73
+ Select an option (4), and it returns:
74
+
75
+ ```
76
+ playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
77
+ 16851097,,,12-09-2017 @ 09:57:00,"10,000 Maniacs and Michael Stipe",To Sir with Love,Campfire Songs,Rhino,post,live,y,
78
+ 16851096,,,12-09-2017 @ 09:44:00,Fran Jeffries,Mine Eyes,Fran Can Really Hang You Up the Most,Warwick,post,live,y,
79
+ ```
80
+
81
+ ## Usage
82
+
83
+ You can use this in a Ruby program by installing the `comma_splice` gem, or you can install it on your system and use the `comma_splice` command line utility.
84
+
85
+
86
+ ##### Return the number of bad lines in a file
87
+
88
+ ```ruby
89
+ CommaSplice::FileCorrector.new(file_path).bad_lines.size
90
+ ```
91
+ ```
92
+ comma_splice bad_line_count /path/to/file.csv
93
+ ```
94
+
95
+ ##### Display the fixed contents
96
+ ```ruby
97
+ CommaSplice::FileCorrector.new(file_path).corrected
98
+ ```
99
+ ```bash
100
+ comma_splice correct /path/to/file.csv
101
+ ```
102
+
103
+ ##### Process a file and save the fixed version
104
+ ```ruby
105
+ CommaSplice::FileCorrector.new(file_path).save(save_path)
106
+ ```
107
+ ```bash
108
+ comma_splice fix /path/to/file.csv /path/to/save
109
+ ```
110
+
111
+ ## Installation
112
+
113
+ Add this line to your application's Gemfile:
114
+
115
+ ```ruby
116
+ gem 'comma_splice'
117
+ ```
118
+
119
+ And then execute:
120
+
121
+ $ bundle
122
+
123
+ Or install it yourself as:
124
+
125
+ $ gem install comma_splice
126
+
127
+ ## Development
128
+
129
+ After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
130
+
131
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
132
+
133
+ ## Contributing
134
+
135
+ Bug reports and pull requests are welcome on GitHub at https://github.com/jkeen/comma_splice.
136
+
137
+ ## License
138
+
139
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+ task :default => :spec
data/bin/comma_splice ADDED
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'comma_splice'
5
+ require 'thor'
6
+
7
+ class CommaSpliceCLI < Thor
8
+ class_option :start_line, type: :numeric, default: nil
9
+ class_option :end_line, type: :numeric, default: nil
10
+
11
+ desc 'fix FILE_PATH [SAVE_PATH]', 'return corrected file contents'
12
+ def correct(file_path)
13
+ file_corrector = CommaSplice::FileCorrector.new(
14
+ file_path,
15
+ start_line: options[:start_line],
16
+ end_line: options[:end_line]
17
+ )
18
+
19
+ puts file_corrector.corrected
20
+ end
21
+
22
+ desc 'fix FILE_PATH [SAVE_PATH]', 'return corrected file contents'
23
+ def fix(file_path, fix_path)
24
+ file_corrector = CommaSplice::FileCorrector.new(
25
+ file_path,
26
+ start_line: options[:start_line],
27
+ end_line: options[:end_line]
28
+ )
29
+
30
+ file_corrector.save(fix_path)
31
+ end
32
+
33
+ desc 'bad_lines FILE_PATH', 'show bad lines'
34
+ def bad_lines(file_path)
35
+ file_corrector = CommaSplice::FileCorrector.new(
36
+ file_path,
37
+ start_line: options[:start_line],
38
+ end_line: options[:end_line]
39
+ )
40
+
41
+ puts file_corrector.bad_lines
42
+ end
43
+
44
+ desc 'bad_line_count FILE_PATH', 'check file contents for needed corrections'
45
+ def bad_line_count(file_path)
46
+ file_corrector = CommaSplice::FileCorrector.new(
47
+ file_path,
48
+ start_line: options[:start_line],
49
+ end_line: options[:end_line]
50
+ )
51
+
52
+ puts file_corrector.bad_lines.size
53
+ end
54
+ end
55
+
56
+ CommaSpliceCLI.start(ARGV)
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "comma_splice"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,46 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "comma_splice/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "comma_splice"
8
+ spec.version = CommaSplice::VERSION
9
+ spec.authors = ["Jeff Keen"]
10
+ spec.email = ["jeff@keen.me"]
11
+
12
+ spec.summary = %q{Fixes CSVs with unescaped commas}
13
+ spec.description = %q{}
14
+ spec.homepage = "http://github.com/jkeen/comma_splice"
15
+ spec.license = "MIT"
16
+
17
+ # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
18
+ # to allow pushing to a single host or delete this section to allow pushing to any host.
19
+ if spec.respond_to?(:metadata)
20
+ spec.metadata["allowed_push_host"] = 'https://rubygems.org'
21
+
22
+ # spec.metadata["homepage_uri"] = spec.homepage
23
+ # spec.metadata["source_code_uri"] = "http://github.com/jkeen/comma_splice"
24
+ # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
25
+ else
26
+ raise "RubyGems 2.0 or newer is required to protect against " \
27
+ "public gem pushes."
28
+ end
29
+
30
+ # Specify which files should be added to the gem when it is released.
31
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
32
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
33
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
34
+ end
35
+ spec.bindir = "bin"
36
+ spec.executables << 'comma_splice'
37
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
38
+ spec.require_paths = ["lib"]
39
+
40
+ spec.add_development_dependency "bundler", "~> 2.0"
41
+ spec.add_development_dependency "rake", "~> 10.0"
42
+ spec.add_development_dependency "rspec"
43
+ spec.add_development_dependency "byebug"
44
+ spec.add_development_dependency "activesupport"
45
+ spec.add_development_dependency "thor"
46
+ end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+ require 'active_support/core_ext/string'
5
+ require 'comma_splice/version'
6
+ require 'comma_splice/helpers/content_finder'
7
+ require 'comma_splice/helpers/variable_column_finder'
8
+ require 'comma_splice/helpers/line'
9
+ require 'comma_splice/helpers/join_possibilities'
10
+ require 'comma_splice/helpers/comma_calculator'
11
+
12
+ require 'comma_splice/line_corrector'
13
+ require 'comma_splice/file_corrector'
14
+ require 'byebug'
15
+
16
+ module CommaSplice
17
+ class Error < StandardError; end
18
+ end
@@ -0,0 +1,81 @@
1
+ module CommaSplice
2
+ class FileCorrector
3
+ attr_reader :file_contents, :csv_content, :start_line, :end_line, :start_column, :end_column
4
+
5
+ def initialize(file_path, start_line: nil, end_line:nil, start_column: nil, end_column: nil)
6
+ @file_path = file_path
7
+ @file_contents = File.read(file_path, encoding: 'utf-8')
8
+
9
+ @content_finder = ContentFinder.new(@file_contents, start_line, end_line)
10
+ @csv_content = @content_finder.content
11
+ @start_line = @content_finder.start_line
12
+ @end_line = @content_finder.start_line
13
+
14
+ if start_column && end_column
15
+ @start_column = start_column
16
+ @end_column = end_column
17
+ else
18
+ finder = VariableColumnFinder.new(@csv_content[0], @csv_content[1..-1])
19
+ @start_column = finder.start_column
20
+ @end_column = finder.end_column
21
+ end
22
+
23
+ raise CommaSplice::Error, "empty contents #{file_path}" unless @csv_content.present?
24
+ end
25
+
26
+ def header
27
+ @header ||= Line.new(csv_content.first)
28
+ end
29
+
30
+ def bad_lines
31
+ line_correctors.select(&:needs_correcting?).collect(&:original)
32
+ end
33
+
34
+ def needs_correcting?
35
+ bad_lines.size.positive?
36
+ end
37
+
38
+ def corrected
39
+ @corrected ||= [
40
+ @file_contents.lines[0, @start_line],
41
+ corrected_lines,
42
+ @file_contents.lines[@end_line, -1]
43
+ ].flatten
44
+ end
45
+
46
+ def save!
47
+ save(@file_path)
48
+ end
49
+
50
+ def save(path)
51
+ File.open(path, 'w+') do |f|
52
+ corrected.each_with_index do |line, index|
53
+ # don't add an extra line break at the end
54
+ f.puts line if corrected.size > index && line
55
+ end
56
+ end
57
+ end
58
+
59
+ def to_json
60
+ @content_finder.parsed.try(:to_json)
61
+ end
62
+
63
+ private
64
+
65
+ def line_correctors
66
+ @line_correctors ||= csv_content.collect do |line|
67
+ LineCorrector.new(header, Line.new(line), @start_column, @end_column)
68
+ end
69
+ end
70
+
71
+ def corrected_lines
72
+ line_correctors.collect do |line|
73
+ if line.needs_correcting?
74
+ line.corrected
75
+ else
76
+ line.original
77
+ end
78
+ end
79
+ end
80
+ end
81
+ end
@@ -0,0 +1,86 @@
1
+ module CommaSplice
2
+ # provide an array of CSV headers and an array of CSV values
3
+ # and this will figure out the best correction and prompt
4
+ # you if it can't find out
5
+
6
+ class CommaCalculator
7
+ def initialize(headers, values)
8
+ @headers = headers
9
+ @values = values
10
+
11
+ raise StandardError, "Determining all the possibilities to fit #{@values.size} values into the #{@headers.size} headers #{@headers.inspect} is computationally expensive. Please specify the columns where commas might be." if @headers.size > 10 && @values.size > 10
12
+ end
13
+
14
+ def correction
15
+ if @headers.size === @values.size
16
+ @values
17
+ elsif best_options.size == 1
18
+ best_options.first
19
+ elsif best_options.size > 1
20
+ prompt_for_options(best_options)
21
+ else
22
+ prompt_for_options(all_options)
23
+ end
24
+ end
25
+
26
+ def all_options
27
+ options = join_possibilities.collect do |joins|
28
+ values = @values.dup
29
+ joins.collect do |join_num|
30
+ val = values.shift(join_num)
31
+ if val.size > 1
32
+ quoted_values(val)
33
+ else
34
+ val.join(',')
35
+ end
36
+ end
37
+ end
38
+ end
39
+
40
+ def best_options
41
+ all_options.select do |option|
42
+ option.none? { |o| o.starts_with?(' ') || o.starts_with?('" ') }
43
+ end
44
+ end
45
+
46
+ def requires_manual_input?
47
+ needs_correcting? && best_options.many?
48
+ end
49
+
50
+ def needs_correcting?
51
+ @headers.size < @values.size
52
+ end
53
+
54
+ private
55
+
56
+ def quoted_values(values)
57
+ "\"#{values.join(',').gsub(/(?<!")(?:"{2})*\K\"/, '""')}\"" # escape a double quote if it hasn't been escaped already
58
+ end
59
+
60
+ def join_possibilities
61
+ JoinPossibilities.new(@values.size, @headers.size).possibilities
62
+ end
63
+
64
+ def prompt_for_options(options)
65
+ longest_header = @headers.max_by(&:length)
66
+
67
+ options.each_with_index do |option, index|
68
+ @headers.each_with_index do |header, i|
69
+ marker = i.zero? ? "(#{index + 1})" : ''
70
+ puts marker.ljust(5) +
71
+ header.ljust(longest_header.size) + ': ' +
72
+ option[i]
73
+ end
74
+ puts "\n"
75
+ end
76
+
77
+ selected_option = nil
78
+ until selected_option && selected_option.to_i > 0
79
+ puts 'which one is correct?'
80
+ selected_option = STDIN.gets
81
+ end
82
+
83
+ options[selected_option.to_i - 1]
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,47 @@
1
+ module CommaSplice
2
+ # Given a file this will find the CSV content. Some files have some non-csv junk at the top
3
+
4
+ class ContentFinder
5
+ attr_reader :start_line, :end_line, :content
6
+
7
+ def initialize(file_contents, start_line = nil, end_line = nil)
8
+ @file_contents = file_contents
9
+
10
+ if start_line && end_line
11
+ # the csvs this was built for have non-csv headers
12
+ @start_line = start_line
13
+ @end_line = end_line
14
+ @content = @file_contents.lines[@start_line..@end_line]
15
+ else
16
+ find_content
17
+ end
18
+ end
19
+
20
+ def find_content
21
+ @start_line = @file_contents.lines.find_index do |line|
22
+ Line.new(line).values.size > 2
23
+ end
24
+
25
+ relative_end_line = @file_contents.lines[@start_line..-1].find_index do |line|
26
+ Line.new(line).values.size < 2
27
+ end
28
+
29
+ if relative_end_line
30
+ @end_line = @start_line + relative_end_line - 1
31
+ else
32
+ @end_line = -1
33
+ end
34
+
35
+ @content = @file_contents.lines[@start_line..@end_line]
36
+ end
37
+
38
+ def parsed
39
+ quote_chars = %w[" | ~ ^ & *]
40
+ begin
41
+ CSV.parse(@content.join('\n'), quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
42
+ rescue CSV::MalformedCSVError
43
+ quote_chars.empty? ? raise : retry
44
+ end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CommaSplice
4
+ class JoinPossibilities
5
+ attr_reader :from_size, :to_size
6
+
7
+ def initialize(value_count, header_count)
8
+ @from_size = value_count
9
+ @to_size = header_count
10
+ end
11
+
12
+ def possibilities
13
+ @possibilities ||= permutations(combos(from_size, to_size))
14
+ end
15
+
16
+ private
17
+
18
+ def permutations(combinations)
19
+ # get all permutations of those combinations
20
+ # to determine every possibility of join
21
+
22
+ all_permutations = combinations.collect do |combo|
23
+ combo.permutation(to_size).to_a
24
+ end
25
+
26
+ # flatten down to a list of arrays
27
+ all_permutations.flatten(1).uniq
28
+ end
29
+
30
+ def combos(desired_size, count, minimum = 1)
31
+ # determine all combinations of [count] numbers that add up to [desired_size]
32
+ # e.g if we have an array of 6 items and want an array of 4 items
33
+ # we need 4 numbers that add up to 6, => [[1, 1, 1, 3], [1, 1, 2, 2]]
34
+
35
+ return [] if desired_size < count || desired_size < minimum
36
+ return [desired_size] if count == 1
37
+
38
+ (minimum..desired_size - 1).flat_map do |i|
39
+ combos(desired_size - i, count - 1, i).map { |r| [i, *r] }
40
+ end
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,21 @@
1
+ module CommaSplice
2
+ class Line
3
+ attr_reader :values, :line
4
+
5
+ def initialize(line)
6
+ @line = line
7
+ @values = parse_csv_content(line).first
8
+ end
9
+
10
+ private
11
+
12
+ def parse_csv_content(content, headers = false)
13
+ quote_chars = %w[" | ~ ^ & *]
14
+ begin
15
+ CSV.parse(content.mb_chars.tidy_bytes.to_s, quote_char: quote_chars.shift, headers: headers, liberal_parsing: true)
16
+ rescue CSV::MalformedCSVError
17
+ quote_chars.empty? ? raise : retry
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CommaSplice
4
+ # Given a header line and some value lines this will try to figure out the columns
5
+ # where it's likely an error might be.
6
+
7
+ # Columns on the left and right bounds will be ignored if each line has the same length
8
+
9
+ # For example, the following CSV will evaluate with @start_column = 5 and @end_column = -4
10
+ # since in this example playid, playtype, genre, and timestamp are all non-variable on the left,
11
+ # and prepost, programtype, iswebcast, and isrequest are non-variable on the right
12
+
13
+ # playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
14
+ # 17385098,,,01-27-2019 @ 13:58:00,Niney & Soul Syndicate,So Long Dub,Dub Box Set Vol. 2,Trojan,post,live,y,
15
+ # 17385097,,,01-27-2019 @ 13:57:00,King Tubby,Love Thy Neighbor,Jesus Dread,Blood & Fire,post,live,y,
16
+ # 17385096,,,01-27-2019 @ 13:53:00,King Tubby / The Aggrovators,Declaration Of Dub,Dub From The Roots,Charly,post,live,y,
17
+ # 17385095,,,01-27-2019 @ 13:50:00,Harry Mudie / King Tubby,Dub With A Difference,In Dub Conference Vol. 1,Moodisc,post,live,y,
18
+ # 17385094,,,01-27-2019 @ 13:47:00,KIng Tubby Meets The Upsetter,King And The Upsetter At Spanish Town,KIng Tubby Meets The Upsetter,Celluloid,post,live,y,
19
+
20
+ class VariableColumnFinder
21
+ attr_reader :start_column, :end_column
22
+
23
+ def initialize(header_line, value_lines)
24
+ @values = value_lines
25
+ @header = header_line
26
+
27
+ find_variable_column_boundaries
28
+ end
29
+
30
+ def find_variable_column_boundaries
31
+ # Now given both of these, we can eliminate some columns on the left and right
32
+ variables = left_to_right_index.zip(right_to_left_index).map do |pair|
33
+ pair == [false, false]
34
+ end
35
+
36
+ start_column = variables.find_index(true)
37
+ end_column = variables.reverse.find_index(true) * -1
38
+
39
+ @start_column = start_column
40
+ @end_column = end_column
41
+ end
42
+
43
+ private
44
+
45
+ def left_to_right_index
46
+ left_to_right_index = []
47
+ @header.split(',').size.times do |time|
48
+ left_to_right_index.push(@values.map do |value_line|
49
+ value_line.split(',')[time].size
50
+ end.uniq.size == 1)
51
+ end
52
+
53
+ left_to_right_index
54
+ end
55
+
56
+ def right_to_left_index
57
+ right_to_left_index = []
58
+ @header.split(',').size.times do |time|
59
+ right_to_left_index.unshift(@values.map do |value_line|
60
+ value_line.split(',')[-time].size
61
+ end.uniq.size == 1)
62
+ end
63
+
64
+ right_to_left_index
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,66 @@
1
+ module CommaSplice
2
+ class LineCorrector
3
+ attr_reader :headers, :values, :header_line, :value_line, :right_bounds, :left_bounds
4
+
5
+ def initialize(header_line, value_line, left_bounds = 0, right_bounds = -1)
6
+ header_line = Line.new(header_line) unless header_line.is_a?(Line)
7
+ value_line = Line.new(value_line) unless value_line.is_a?(Line)
8
+
9
+ @header_line = header_line
10
+ @value_line = value_line
11
+ @headers = header_line.values
12
+ @values = value_line.values
13
+ @left_bounds = left_bounds
14
+ @right_bounds = right_bounds
15
+
16
+ raise 'right bounds must be less than -1' unless right_bounds < 0
17
+ raise 'left bounds must be greater than zero' unless left_bounds >= 0
18
+ end
19
+
20
+ def needs_correcting?
21
+ @values && @values.size > 0 && @headers.size != @values.size
22
+ end
23
+
24
+ def original
25
+ @values.join(',')
26
+ end
27
+
28
+ def corrected
29
+ # you want to provide this with the smallest set of possibilities
30
+ # for performance reasons. Left and right bounds limit the values
31
+ # where the comma error could be
32
+
33
+ # For instance, with the following headers:
34
+ # [playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest]
35
+ # the only values that could contain an extra comma are "artist,title,albumtitle,label"
36
+ # therefore our left_bounds = 4, right_bounds = -5
37
+
38
+ values_before = if left_bounds > 0
39
+ values[0..(left_bounds - 1)]
40
+ else
41
+ []
42
+ end
43
+
44
+ values_after = if right_bounds < -1
45
+ values[(right_bounds + 1)..-1]
46
+ else
47
+ []
48
+ end
49
+ [values_before, corrector.correction, values_after].flatten.join(',')
50
+ end
51
+
52
+ private
53
+
54
+ def corrector
55
+ CommaCalculator.new(selected_headers, selected_values)
56
+ end
57
+
58
+ def selected_headers
59
+ headers[left_bounds..right_bounds]
60
+ end
61
+
62
+ def selected_values
63
+ values[left_bounds..right_bounds]
64
+ end
65
+ end
66
+ end
@@ -0,0 +1,3 @@
1
+ module CommaSplice
2
+ VERSION = "0.1.0"
3
+ end
metadata ADDED
@@ -0,0 +1,150 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: comma_splice
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jeff Keen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-08-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: byebug
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: activesupport
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: thor
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: ''
98
+ email:
99
+ - jeff@keen.me
100
+ executables:
101
+ - comma_splice
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - ".rspec"
107
+ - Gemfile
108
+ - Gemfile.lock
109
+ - LICENSE.txt
110
+ - README.md
111
+ - Rakefile
112
+ - bin/comma_splice
113
+ - bin/console
114
+ - bin/setup
115
+ - comma_splice.gemspec
116
+ - lib/comma_splice.rb
117
+ - lib/comma_splice/file_corrector.rb
118
+ - lib/comma_splice/helpers/comma_calculator.rb
119
+ - lib/comma_splice/helpers/content_finder.rb
120
+ - lib/comma_splice/helpers/join_possibilities.rb
121
+ - lib/comma_splice/helpers/line.rb
122
+ - lib/comma_splice/helpers/variable_column_finder.rb
123
+ - lib/comma_splice/line_corrector.rb
124
+ - lib/comma_splice/version.rb
125
+ homepage: http://github.com/jkeen/comma_splice
126
+ licenses:
127
+ - MIT
128
+ metadata:
129
+ allowed_push_host: https://rubygems.org
130
+ post_install_message:
131
+ rdoc_options: []
132
+ require_paths:
133
+ - lib
134
+ required_ruby_version: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ required_rubygems_version: !ruby/object:Gem::Requirement
140
+ requirements:
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
144
+ requirements: []
145
+ rubyforge_project:
146
+ rubygems_version: 2.7.8
147
+ signing_key:
148
+ specification_version: 4
149
+ summary: Fixes CSVs with unescaped commas
150
+ test_files: []