comma_splice 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: e0a958c45c6fe94fbc6a4af1cb5fc2a0c20cf7a68728520d227b62a79e2078a5
4
+ data.tar.gz: 7e4e09b4a3308a2ccb2c5a58d1b56ced9f6f5927bf41d89680c4401b5aad2b5d
5
+ SHA512:
6
+ metadata.gz: 1daff9cfdd5f54c37bd47edc83b02308a0f3fe7b0d8af86dd0a910e264929a5b5b1191386b3b06df42f157770c4bc733c656e3beb4d9bafe10d7bc65a5483185
7
+ data.tar.gz: f6e4c32e2e03d3bf915a9b8360146eaa34ed9b34a954cb4b9a74b08c68ef4f8ae15cba83c854f96b88b098b791f802200992e46194a373e27cdd388c40311136
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ .DS_Store
10
+ .byebug_history
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in comma_splice.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,52 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ comma_splice (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ activesupport (5.2.3)
10
+ concurrent-ruby (~> 1.0, >= 1.0.2)
11
+ i18n (>= 0.7, < 2)
12
+ minitest (~> 5.1)
13
+ tzinfo (~> 1.1)
14
+ byebug (11.0.1)
15
+ concurrent-ruby (1.1.5)
16
+ diff-lcs (1.3)
17
+ i18n (1.6.0)
18
+ concurrent-ruby (~> 1.0)
19
+ minitest (5.11.3)
20
+ rake (10.5.0)
21
+ rspec (3.8.0)
22
+ rspec-core (~> 3.8.0)
23
+ rspec-expectations (~> 3.8.0)
24
+ rspec-mocks (~> 3.8.0)
25
+ rspec-core (3.8.2)
26
+ rspec-support (~> 3.8.0)
27
+ rspec-expectations (3.8.4)
28
+ diff-lcs (>= 1.2.0, < 2.0)
29
+ rspec-support (~> 3.8.0)
30
+ rspec-mocks (3.8.1)
31
+ diff-lcs (>= 1.2.0, < 2.0)
32
+ rspec-support (~> 3.8.0)
33
+ rspec-support (3.8.2)
34
+ thor (0.20.3)
35
+ thread_safe (0.3.6)
36
+ tzinfo (1.2.5)
37
+ thread_safe (~> 0.1)
38
+
39
+ PLATFORMS
40
+ ruby
41
+
42
+ DEPENDENCIES
43
+ activesupport
44
+ bundler (~> 2.0)
45
+ byebug
46
+ comma_splice!
47
+ rake (~> 10.0)
48
+ rspec
49
+ thor
50
+
51
+ BUNDLED WITH
52
+ 2.0.1
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 Jeff Keen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,139 @@
1
+ # Comma Splice
2
+
3
+ This gem tackles one very specific problem: when CSVs have commas in the values and the values haven't been quoted. This determines which commas separate fields and which commas are part of a value, and corrects the file.
4
+
5
+ For example, given the following CSV
6
+
7
+ ```
8
+ timestamp,artist,title,albumtitle,label
9
+ 01-27-2019 @ 12:34:00,Lester Sterling, Lynn Taitt & The Jets,Check Point Charlie,Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968,Dub Store,
10
+ 01-27-2019 @ 12:31:00,Lester Sterling,Lester Sterling Special,Merritone Rock Steady 2: This Music Got Soul 1966-1967,Dub Store,
11
+
12
+ ```
13
+
14
+ which parses incorrectly as:
15
+
16
+ | timestamp | artist | title | albumtitle | label |
17
+ |-----------------------|-----------------|-------------|-----------------|------------------------------------------------------------|
18
+ | 01-27-2019 @ 12:34:00 | Lester Sterling | Lynn Taitt & The Jets | Check Point Charlie | Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968
19
+ | 01-27-2019 @ 12:31:00 | Lester Sterling | Lester Sterling Special | Merritone Rock Steady 2: This Music Got Soul 1966-1967 | Dub Store |
20
+
21
+
22
+ Running this through `comma_splice correct /path/to/file` will return this corrected content:
23
+
24
+ ```
25
+ timestamp,artist,title,albumtitle,label
26
+ 01-27-2019 @ 12:34:00,"Lester Sterling, Lynn Taitt & The Jets",Check Point Charlie,Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968,Dub Store,
27
+ 01-27-2019 @ 12:31:00,Lester Sterling,Lester Sterling Special,Merritone Rock Steady 2: This Music Got Soul 1966-1967,Dub Store,
28
+ ```
29
+
30
+ | timestamp | artist | title | albumtitle | label |
31
+ |-----------------------|-----------------|-------------|-----------------|------------------------------------------------------------|
32
+ | 01-27-2019 @ 12:34:00 | Lester Sterling, Lynn Taitt & The Jets | Check Point Charlie | Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968 | Dub Store |
33
+ | 01-27-2019 @ 12:31:00 | Lester Sterling | Lester Sterling Special | Merritone Rock Steady 2: This Music Got Soul 1966-1967 | Dub Store |
34
+
35
+
36
+ If it can't determine where the comma should go, it prompts you to choose among the possible options.
37
+
38
+
39
+ Given the following CSV:
40
+
41
+ ```
42
+ playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
43
+ 16851097,,,12-09-2017 @ 09:57:00,10,000 Maniacs and Michael Stipe,To Sir with Love,Campfire Songs,Rhino,post,live,y,
44
+ 16851096,,,12-09-2017 @ 09:44:00,Fran Jeffries,Mine Eyes,Fran Can Really Hang You Up the Most,Warwick,post,live,y,
45
+ ```
46
+
47
+ It prompts:
48
+
49
+ ```
50
+ Which one of these is correct?
51
+
52
+ (1) artist : 10
53
+ title : 000 Maniacs and Michael Stipe
54
+ albumtitle: To Sir with Love
55
+ label : "Campfire Songs,Rhino"
56
+
57
+ (2) artist : 10
58
+ title : 000 Maniacs and Michael Stipe
59
+ albumtitle: "To Sir with Love,Campfire Songs"
60
+ label : Rhino
61
+
62
+ (3) artist : 10
63
+ title : "000 Maniacs and Michael Stipe,To Sir with Love"
64
+ albumtitle: Campfire Songs
65
+ label : Rhino
66
+
67
+ (4) artist : "10,000 Maniacs and Michael Stipe"
68
+ title : To Sir with Love
69
+ albumtitle: Campfire Songs
70
+ label : Rhino
71
+ ```
72
+
73
+ Select an option (4), and it returns:
74
+
75
+ ```
76
+ playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
77
+ 16851097,,,12-09-2017 @ 09:57:00,"10,000 Maniacs and Michael Stipe",To Sir with Love,Campfire Songs,Rhino,post,live,y,
78
+ 16851096,,,12-09-2017 @ 09:44:00,Fran Jeffries,Mine Eyes,Fran Can Really Hang You Up the Most,Warwick,post,live,y,
79
+ ```
80
+
81
+ ## Usage
82
+
83
+ You can use this in a Ruby program by installing the `comma_splice` gem, or you can install it on your system and use the `comma_splice` command line utility.
84
+
85
+
86
+ ##### Return the number of bad lines in a file
87
+
88
+ ```ruby
89
+ CommaSplice::FileCorrector.new(file_path).bad_lines.size
90
+ ```
91
+ ```
92
+ comma_splice bad_line_count /path/to/file.csv
93
+ ```
94
+
95
+ ##### Display the fixed contents
96
+ ```ruby
97
+ CommaSplice::FileCorrector.new(file_path).corrected
98
+ ```
99
+ ```bash
100
+ comma_splice correct /path/to/file.csv
101
+ ```
102
+
103
+ ##### Process a file and save the fixed version
104
+ ```ruby
105
+ CommaSplice::FileCorrector.new(file_path).save(save_path)
106
+ ```
107
+ ```bash
108
+ comma_splice fix /path/to/file.csv /path/to/save
109
+ ```
110
+
111
+ ## Installation
112
+
113
+ Add this line to your application's Gemfile:
114
+
115
+ ```ruby
116
+ gem 'comma_splice'
117
+ ```
118
+
119
+ And then execute:
120
+
121
+ $ bundle
122
+
123
+ Or install it yourself as:
124
+
125
+ $ gem install comma_splice
126
+
127
+ ## Development
128
+
129
+ After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
130
+
131
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
132
+
133
+ ## Contributing
134
+
135
+ Bug reports and pull requests are welcome on GitHub at https://github.com/jkeen/comma_splice.
136
+
137
+ ## License
138
+
139
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+ task :default => :spec
data/bin/comma_splice ADDED
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'comma_splice'
5
+ require 'thor'
6
+
7
+ class CommaSpliceCLI < Thor
8
+ class_option :start_line, type: :numeric, default: nil
9
+ class_option :end_line, type: :numeric, default: nil
10
+
11
+ desc 'fix FILE_PATH [SAVE_PATH]', 'return corrected file contents'
12
+ def correct(file_path)
13
+ file_corrector = CommaSplice::FileCorrector.new(
14
+ file_path,
15
+ start_line: options[:start_line],
16
+ end_line: options[:end_line]
17
+ )
18
+
19
+ puts file_corrector.corrected
20
+ end
21
+
22
+ desc 'fix FILE_PATH [SAVE_PATH]', 'return corrected file contents'
23
+ def fix(file_path, fix_path)
24
+ file_corrector = CommaSplice::FileCorrector.new(
25
+ file_path,
26
+ start_line: options[:start_line],
27
+ end_line: options[:end_line]
28
+ )
29
+
30
+ file_corrector.save(fix_path)
31
+ end
32
+
33
+ desc 'bad_lines FILE_PATH', 'show bad lines'
34
+ def bad_lines(file_path)
35
+ file_corrector = CommaSplice::FileCorrector.new(
36
+ file_path,
37
+ start_line: options[:start_line],
38
+ end_line: options[:end_line]
39
+ )
40
+
41
+ puts file_corrector.bad_lines
42
+ end
43
+
44
+ desc 'bad_line_count FILE_PATH', 'check file contents for needed corrections'
45
+ def bad_line_count(file_path)
46
+ file_corrector = CommaSplice::FileCorrector.new(
47
+ file_path,
48
+ start_line: options[:start_line],
49
+ end_line: options[:end_line]
50
+ )
51
+
52
+ puts file_corrector.bad_lines.size
53
+ end
54
+ end
55
+
56
+ CommaSpliceCLI.start(ARGV)
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "comma_splice"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,46 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "comma_splice/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "comma_splice"
8
+ spec.version = CommaSplice::VERSION
9
+ spec.authors = ["Jeff Keen"]
10
+ spec.email = ["jeff@keen.me"]
11
+
12
+ spec.summary = %q{Fixes CSVs with unescaped commas}
13
+ spec.description = %q{}
14
+ spec.homepage = "http://github.com/jkeen/comma_splice"
15
+ spec.license = "MIT"
16
+
17
+ # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
18
+ # to allow pushing to a single host or delete this section to allow pushing to any host.
19
+ if spec.respond_to?(:metadata)
20
+ spec.metadata["allowed_push_host"] = 'https://rubygems.org'
21
+
22
+ # spec.metadata["homepage_uri"] = spec.homepage
23
+ # spec.metadata["source_code_uri"] = "http://github.com/jkeen/comma_splice"
24
+ # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
25
+ else
26
+ raise "RubyGems 2.0 or newer is required to protect against " \
27
+ "public gem pushes."
28
+ end
29
+
30
+ # Specify which files should be added to the gem when it is released.
31
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
32
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
33
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
34
+ end
35
+ spec.bindir = "bin"
36
+ spec.executables << 'comma_splice'
37
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
38
+ spec.require_paths = ["lib"]
39
+
40
+ spec.add_development_dependency "bundler", "~> 2.0"
41
+ spec.add_development_dependency "rake", "~> 10.0"
42
+ spec.add_development_dependency "rspec"
43
+ spec.add_development_dependency "byebug"
44
+ spec.add_development_dependency "activesupport"
45
+ spec.add_development_dependency "thor"
46
+ end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+ require 'active_support/core_ext/string'
5
+ require 'comma_splice/version'
6
+ require 'comma_splice/helpers/content_finder'
7
+ require 'comma_splice/helpers/variable_column_finder'
8
+ require 'comma_splice/helpers/line'
9
+ require 'comma_splice/helpers/join_possibilities'
10
+ require 'comma_splice/helpers/comma_calculator'
11
+
12
+ require 'comma_splice/line_corrector'
13
+ require 'comma_splice/file_corrector'
14
+
15
+ # byebug is declared as a development dependency only, so loading the
16
+ # library must not fail when it is not installed.
17
+ begin
18
+   require 'byebug'
19
+ rescue LoadError
20
+   nil
21
+ end
22
+
23
+ module CommaSplice
24
+   # Error class raised by this gem.
25
+   class Error < StandardError; end
26
+ end
@@ -0,0 +1,98 @@
1
+ module CommaSplice
2
+   # Reads a file, locates its CSV content, and corrects the lines whose
3
+   # value count does not match the header.
4
+   class FileCorrector
5
+     attr_reader :file_contents, :csv_content, :start_line, :end_line, :start_column, :end_column
6
+
7
+     # file_path - path of the file to correct; it is read as UTF-8.
8
+     # start_line, end_line - passed to ContentFinder to delimit the CSV content.
9
+     # start_column, end_column - bounds of the columns that may contain stray
10
+     #   commas; VariableColumnFinder picks them unless both are given.
11
+     #
12
+     # Raises CommaSplice::Error when the CSV content is empty.
13
+     def initialize(file_path, start_line: nil, end_line: nil, start_column: nil, end_column: nil)
14
+       @file_path = file_path
15
+       @file_contents = File.read(file_path, encoding: 'utf-8')
16
+
17
+       @content_finder = ContentFinder.new(@file_contents, start_line, end_line)
18
+       @csv_content = @content_finder.content
19
+       @start_line = @content_finder.start_line
20
+       @end_line = @content_finder.end_line
21
+
22
+       # Validate before the content is indexed below.
23
+       raise CommaSplice::Error, "empty contents #{file_path}" unless @csv_content.present?
24
+
25
+       if start_column && end_column
26
+         @start_column = start_column
27
+         @end_column = end_column
28
+       else
29
+         finder = VariableColumnFinder.new(@csv_content[0], @csv_content[1..-1])
30
+         @start_column = finder.start_column
31
+         @end_column = finder.end_column
32
+       end
33
+     end
34
+
35
+     # The first line of the CSV content as a Line.
36
+     def header
37
+       @header ||= Line.new(csv_content.first)
38
+     end
39
+
40
+     # Original text of every line that needs correcting.
41
+     def bad_lines
42
+       line_correctors.select(&:needs_correcting?).collect(&:original)
43
+     end
44
+
45
+     def needs_correcting?
46
+       bad_lines.size.positive?
47
+     end
48
+
49
+     # The whole file as an array of lines: the lines before the CSV content,
50
+     # the CSV lines (corrected where needed), and the lines after it.
51
+     def corrected
52
+       @corrected ||= [
53
+         @file_contents.lines[0, @start_line],
54
+         corrected_lines,
55
+         trailing_lines
56
+       ].flatten
57
+     end
58
+
59
+     # Overwrites the file that was read.
60
+     def save!
61
+       save(@file_path)
62
+     end
63
+
64
+     # Writes the corrected lines to path.
65
+     def save(path)
66
+       File.open(path, 'w+') do |f|
67
+         # puts appends a newline only to lines that do not already end with one
68
+         corrected.compact.each { |line| f.puts line }
69
+       end
70
+     end
71
+
72
+     def to_json
73
+       @content_finder.parsed.try(:to_json)
74
+     end
75
+
76
+     private
77
+
78
+     def line_correctors
79
+       @line_correctors ||= csv_content.collect do |line|
80
+         LineCorrector.new(header, Line.new(line), @start_column, @end_column)
81
+       end
82
+     end
83
+
84
+     def corrected_lines
85
+       line_correctors.collect do |line|
86
+         line.needs_correcting? ? line.corrected : line.original
87
+       end
88
+     end
89
+
90
+     # Lines after the CSV content. ContentFinder reports an end line of -1
91
+     # when the content runs to the end of the file.
92
+     def trailing_lines
93
+       return [] if @end_line.nil? || @end_line == -1
94
+
95
+       @file_contents.lines[(@end_line + 1)..-1] || []
96
+     end
97
+   end
98
+ end
@@ -0,0 +1,86 @@
1
+ module CommaSplice
2
+ # provide an array of CSV headers and and array of CSV values
3
+ # and this will figure out the best correction and prompt
4
+ # you if it can't find out
5
+
6
+ class CommaCalculator
7
+ def initialize(headers, values)
8
+ @headers = headers
9
+ @values = values
10
+
11
+ raise StandardError, "Determining all the possibilities to fit #{@values.size} values into the #{@headers.size} headers #{@headers.inspect} is computationally expensive. Please specify the columns where commas might be." if @headers.size > 10 && @values.size > 10
12
+ end
13
+
14
+ def correction
15
+ if @headers.size === @values.size
16
+ @values
17
+ elsif best_options.size == 1
18
+ best_options.first
19
+ elsif best_options.size > 1
20
+ prompt_for_options(best_options)
21
+ else
22
+ prompt_for_options(all_options)
23
+ end
24
+ end
25
+
26
+ def all_options
27
+ options = join_possibilities.collect do |joins|
28
+ values = @values.dup
29
+ joins.collect do |join_num|
30
+ val = values.shift(join_num)
31
+ if val.size > 1
32
+ quoted_values(val)
33
+ else
34
+ val.join(',')
35
+ end
36
+ end
37
+ end
38
+ end
39
+
40
+ def best_options
41
+ all_options.select do |option|
42
+ option.none? { |o| o.starts_with?(' ') || o.starts_with?('" ') }
43
+ end
44
+ end
45
+
46
+ def requires_manual_input?
47
+ needs_correcting? && best_options.many?
48
+ end
49
+
50
+ def needs_correcting?
51
+ @headers.size < @values.size
52
+ end
53
+
54
+ private
55
+
56
+ def quoted_values(values)
57
+ "\"#{values.join(',').gsub(/(?<!")(?:"{2})*\K\"/, '""')}\"" # escape a double quote if it hasn't been escaped already
58
+ end
59
+
60
+ def join_possibilities
61
+ JoinPossibilities.new(@values.size, @headers.size).possibilities
62
+ end
63
+
64
+ def prompt_for_options(options)
65
+ longest_header = @headers.max_by(&:length)
66
+
67
+ options.each_with_index do |option, index|
68
+ @headers.each_with_index do |header, i|
69
+ marker = i.zero? ? "(#{index + 1})" : ''
70
+ puts marker.ljust(5) +
71
+ header.ljust(longest_header.size) + ': ' +
72
+ option[i]
73
+ end
74
+ puts "\n"
75
+ end
76
+
77
+ selected_option = nil
78
+ until selected_option && selected_option.to_i > 0
79
+ puts 'which one is correct?'
80
+ selected_option = STDIN.gets
81
+ end
82
+
83
+ options[selected_option.to_i - 1]
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,47 @@
1
+ module CommaSplice
2
+ # Given a file this will find the CSV content. Some files have some non-csv junk at the top
3
+
4
+ class ContentFinder
5
+ attr_reader :start_line, :end_line, :content
6
+
7
+ def initialize(file_contents, start_line = nil, end_line = nil)
8
+ @file_contents = file_contents
9
+
10
+ if start_line && end_line
11
+ # the csvs this was built for have non-csv headers
12
+ @start_line = start_line
13
+ @end_line = end_line
14
+ @content = @file_contents.lines[@start_line..@end_line]
15
+ else
16
+ find_content
17
+ end
18
+ end
19
+
20
+ def find_content
21
+ @start_line = @file_contents.lines.find_index do |line|
22
+ Line.new(line).values.size > 2
23
+ end
24
+
25
+ relative_end_line = @file_contents.lines[@start_line..-1].find_index do |line|
26
+ Line.new(line).values.size < 2
27
+ end
28
+
29
+ if relative_end_line
30
+ @end_line = @start_line + relative_end_line - 1
31
+ else
32
+ @end_line = -1
33
+ end
34
+
35
+ @content = @file_contents.lines[@start_line..@end_line]
36
+ end
37
+
38
+ def parsed
39
+ quote_chars = %w[" | ~ ^ & *]
40
+ begin
41
+ CSV.parse(@content.join('\n'), quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
42
+ rescue CSV::MalformedCSVError
43
+ quote_chars.empty? ? raise : retry
44
+ end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CommaSplice
4
+ class JoinPossibilities
5
+ attr_reader :from_size, :to_size
6
+
7
+ def initialize(value_count, header_count)
8
+ @from_size = value_count
9
+ @to_size = header_count
10
+ end
11
+
12
+ def possibilities
13
+ @possibilities ||= permutations(combos(from_size, to_size))
14
+ end
15
+
16
+ private
17
+
18
+ def permutations(combinations)
19
+ # get all permutations of those combinations
20
+ # to determine every possibility of join
21
+
22
+ all_permutations = combinations.collect do |combo|
23
+ combo.permutation(to_size).to_a
24
+ end
25
+
26
+ # flatten down to a list of arrays
27
+ all_permutations.flatten(1).uniq
28
+ end
29
+
30
+ def combos(desired_size, count, minimum = 1)
31
+ # determine all combinations of [count] numbers that add up to [desired_size]
32
+ # e.g if we have an array of 6 items and want an array of 4 items
33
+ # we need 4 numbers that add up to 6, => [[1, 1, 1, 3], [1, 1, 2, 2]]
34
+
35
+ return [] if desired_size < count || desired_size < minimum
36
+ return [desired_size] if count == 1
37
+
38
+ (minimum..desired_size - 1).flat_map do |i|
39
+ combos(desired_size - i, count - 1, i).map { |r| [i, *r] }
40
+ end
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,21 @@
1
+ module CommaSplice
2
+ class Line
3
+ attr_reader :values, :line
4
+
5
+ def initialize(line)
6
+ @line = line
7
+ @values = parse_csv_content(line).first
8
+ end
9
+
10
+ private
11
+
12
+ def parse_csv_content(content, headers = false)
13
+ quote_chars = %w[" | ~ ^ & *]
14
+ begin
15
+ CSV.parse(content.mb_chars.tidy_bytes.to_s, quote_char: quote_chars.shift, headers: headers, liberal_parsing: true)
16
+ rescue CSV::MalformedCSVError
17
+ quote_chars.empty? ? raise : retry
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CommaSplice
4
+ # Given a header line and some value lines this will try to figure out the columns
5
+ # where it's likely an error might be.
6
+
7
+ # Columns on the left and right bounds will be ignored if each line has the same length
8
+
9
+ # For example, the following CSV will evaluate with @start_column = 5 and @end_column = -4
10
+ # since in this example playid, playtime, genre, and timestamp are all non-variable on the left,
11
+ # and prepost, programtype, iswebcast, and isrequest are non-variable on the right
12
+
13
+ # playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
14
+ # 17385098,,,01-27-2019 @ 13:58:00,Niney & Soul Syndicate,So Long Dub,Dub Box Set Vol. 2,Trojan,post,live,y,
15
+ # 17385097,,,01-27-2019 @ 13:57:00,King Tubby,Love Thy Neighbor,Jesus Dread,Blood & Fire,post,live,y,
16
+ # 17385096,,,01-27-2019 @ 13:53:00,King Tubby / The Aggrovators,Declaration Of Dub,Dub From The Roots,Charly,post,live,y,
17
+ # 17385095,,,01-27-2019 @ 13:50:00,Harry Mudie / King Tubby,Dub With A Difference,In Dub Conference Vol. 1,Moodisc,post,live,y,
18
+ # 17385094,,,01-27-2019 @ 13:47:00,KIng Tubby Meets The Upsetter,King And The Upsetter At Spanish Town,KIng Tubby Meets The Upsetter,Celluloid,post,live,y,
19
+
20
+ class VariableColumnFinder # finds the range of positions whose field lengths differ between value lines
21
+ attr_reader :start_column, :end_column
22
+
23
+ def initialize(header_line, value_lines) # header_line: String; value_lines: array of Strings (both are split on ',')
24
+ @values = value_lines
25
+ @header = header_line
26
+
27
+ find_variable_column_boundaries
28
+ end
29
+
30
+ def find_variable_column_boundaries # sets @start_column and @end_column
31
+ # Now given both of these, we can eliminate some columns on the left and right
32
+ variables = left_to_right_index.zip(right_to_left_index).map do |pair|
33
+ pair == [false, false] # true when neither scan found uniform field lengths at this position
34
+ end
35
+
36
+ start_column = variables.find_index(true) # NOTE(review): nil when no position qualifies
37
+ end_column = variables.reverse.find_index(true) * -1 # NOTE(review): raises NoMethodError when find_index returns nil; can also yield 0 - confirm callers accept that
38
+
39
+ @start_column = start_column
40
+ @end_column = end_column
41
+ end
42
+
43
+ private
44
+
45
+ def left_to_right_index # one boolean per header field, indexing value fields from the left
46
+ left_to_right_index = []
47
+ @header.split(',').size.times do |time|
48
+ left_to_right_index.push(@values.map do |value_line|
49
+ value_line.split(',')[time].size # NOTE(review): raises NoMethodError if a line has fewer fields than the header
50
+ end.uniq.size == 1) # true when every line's field at this position has the same length
51
+ end
52
+
53
+ left_to_right_index
54
+ end
55
+
56
+ def right_to_left_index # one boolean per header field, indexing value fields with negative indexes
57
+ right_to_left_index = []
58
+ @header.split(',').size.times do |time|
59
+ right_to_left_index.unshift(@values.map do |value_line|
60
+ value_line.split(',')[-time].size # NOTE(review): for time == 0 this reads index 0 (the first field), not the last - confirm intended
61
+ end.uniq.size == 1)
62
+ end
63
+
64
+ right_to_left_index
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,66 @@
1
+ module CommaSplice
2
+ class LineCorrector
3
+ attr_reader :headers, :values, :header_line, :value_line, :right_bounds, :left_bounds
4
+
5
+ def initialize(header_line, value_line, left_bounds = 0, right_bounds = -1)
6
+ header_line = Line.new(header_line) unless header_line.is_a?(Line)
7
+ value_line = Line.new(value_line) unless value_line.is_a?(Line)
8
+
9
+ @header_line = header_line
10
+ @value_line = value_line
11
+ @headers = header_line.values
12
+ @values = value_line.values
13
+ @left_bounds = left_bounds
14
+ @right_bounds = right_bounds
15
+
16
+ raise 'right bounds must be less than -1' unless right_bounds < 0
17
+ raise 'left bounds must be greater than zero' unless left_bounds >= 0
18
+ end
19
+
20
+ def needs_correcting?
21
+ @values && @values.size > 0 && @headers.size != @values.size
22
+ end
23
+
24
+ def original
25
+ @values.join(',')
26
+ end
27
+
28
+ def corrected
29
+ # you want to provide this with the smallest set of possibilities
30
+ # for performance reasons. Left and right bounds limit the values
31
+ # where the comma error could be
32
+
33
+ # For instance, with the following headers:
34
+ # [playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest]
35
+ # the only values that could contain an extra comma are "artist,title,albumtitle,label"
36
+ # therefore our left_bounds = 4, right_bounds = -5
37
+
38
+ values_before = if left_bounds > 0
39
+ values[0..(left_bounds - 1)]
40
+ else
41
+ []
42
+ end
43
+
44
+ values_after = if right_bounds < -1
45
+ values[(right_bounds + 1)..-1]
46
+ else
47
+ []
48
+ end
49
+ [values_before, corrector.correction, values_after].flatten.join(',')
50
+ end
51
+
52
+ private
53
+
54
+ def corrector
55
+ CommaCalculator.new(selected_headers, selected_values)
56
+ end
57
+
58
+ def selected_headers
59
+ headers[left_bounds..right_bounds]
60
+ end
61
+
62
+ def selected_values
63
+ values[left_bounds..right_bounds]
64
+ end
65
+ end
66
+ end
@@ -0,0 +1,3 @@
1
+ module CommaSplice
2
+ VERSION = "0.1.0"
3
+ end
metadata ADDED
@@ -0,0 +1,150 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: comma_splice
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jeff Keen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-08-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: byebug
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: activesupport
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: thor
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: ''
98
+ email:
99
+ - jeff@keen.me
100
+ executables:
101
+ - comma_splice
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - ".rspec"
107
+ - Gemfile
108
+ - Gemfile.lock
109
+ - LICENSE.txt
110
+ - README.md
111
+ - Rakefile
112
+ - bin/comma_splice
113
+ - bin/console
114
+ - bin/setup
115
+ - comma_splice.gemspec
116
+ - lib/comma_splice.rb
117
+ - lib/comma_splice/file_corrector.rb
118
+ - lib/comma_splice/helpers/comma_calculator.rb
119
+ - lib/comma_splice/helpers/content_finder.rb
120
+ - lib/comma_splice/helpers/join_possibilities.rb
121
+ - lib/comma_splice/helpers/line.rb
122
+ - lib/comma_splice/helpers/variable_column_finder.rb
123
+ - lib/comma_splice/line_corrector.rb
124
+ - lib/comma_splice/version.rb
125
+ homepage: http://github.com/jkeen/comma_splice
126
+ licenses:
127
+ - MIT
128
+ metadata:
129
+ allowed_push_host: https://rubygems.org
130
+ post_install_message:
131
+ rdoc_options: []
132
+ require_paths:
133
+ - lib
134
+ required_ruby_version: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ required_rubygems_version: !ruby/object:Gem::Requirement
140
+ requirements:
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
144
+ requirements: []
145
+ rubyforge_project:
146
+ rubygems_version: 2.7.8
147
+ signing_key:
148
+ specification_version: 4
149
+ summary: Fixes CSVs with unescaped commas
150
+ test_files: []