comma_splice 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +52 -0
- data/LICENSE.txt +21 -0
- data/README.md +139 -0
- data/Rakefile +2 -0
- data/bin/comma_splice +56 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/comma_splice.gemspec +46 -0
- data/lib/comma_splice.rb +18 -0
- data/lib/comma_splice/file_corrector.rb +81 -0
- data/lib/comma_splice/helpers/comma_calculator.rb +86 -0
- data/lib/comma_splice/helpers/content_finder.rb +47 -0
- data/lib/comma_splice/helpers/join_possibilities.rb +43 -0
- data/lib/comma_splice/helpers/line.rb +21 -0
- data/lib/comma_splice/helpers/variable_column_finder.rb +67 -0
- data/lib/comma_splice/line_corrector.rb +66 -0
- data/lib/comma_splice/version.rb +3 -0
- metadata +150 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e0a958c45c6fe94fbc6a4af1cb5fc2a0c20cf7a68728520d227b62a79e2078a5
|
4
|
+
data.tar.gz: 7e4e09b4a3308a2ccb2c5a58d1b56ced9f6f5927bf41d89680c4401b5aad2b5d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1daff9cfdd5f54c37bd47edc83b02308a0f3fe7b0d8af86dd0a910e264929a5b5b1191386b3b06df42f157770c4bc733c656e3beb4d9bafe10d7bc65a5483185
|
7
|
+
data.tar.gz: f6e4c32e2e03d3bf915a9b8360146eaa34ed9b34a954cb4b9a74b08c68ef4f8ae15cba83c854f96b88b098b791f802200992e46194a373e27cdd388c40311136
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
comma_splice (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
activesupport (5.2.3)
|
10
|
+
concurrent-ruby (~> 1.0, >= 1.0.2)
|
11
|
+
i18n (>= 0.7, < 2)
|
12
|
+
minitest (~> 5.1)
|
13
|
+
tzinfo (~> 1.1)
|
14
|
+
byebug (11.0.1)
|
15
|
+
concurrent-ruby (1.1.5)
|
16
|
+
diff-lcs (1.3)
|
17
|
+
i18n (1.6.0)
|
18
|
+
concurrent-ruby (~> 1.0)
|
19
|
+
minitest (5.11.3)
|
20
|
+
rake (10.5.0)
|
21
|
+
rspec (3.8.0)
|
22
|
+
rspec-core (~> 3.8.0)
|
23
|
+
rspec-expectations (~> 3.8.0)
|
24
|
+
rspec-mocks (~> 3.8.0)
|
25
|
+
rspec-core (3.8.2)
|
26
|
+
rspec-support (~> 3.8.0)
|
27
|
+
rspec-expectations (3.8.4)
|
28
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
29
|
+
rspec-support (~> 3.8.0)
|
30
|
+
rspec-mocks (3.8.1)
|
31
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
32
|
+
rspec-support (~> 3.8.0)
|
33
|
+
rspec-support (3.8.2)
|
34
|
+
thor (0.20.3)
|
35
|
+
thread_safe (0.3.6)
|
36
|
+
tzinfo (1.2.5)
|
37
|
+
thread_safe (~> 0.1)
|
38
|
+
|
39
|
+
PLATFORMS
|
40
|
+
ruby
|
41
|
+
|
42
|
+
DEPENDENCIES
|
43
|
+
activesupport
|
44
|
+
bundler (~> 2.0)
|
45
|
+
byebug
|
46
|
+
comma_splice!
|
47
|
+
rake (~> 10.0)
|
48
|
+
rspec
|
49
|
+
thor
|
50
|
+
|
51
|
+
BUNDLED WITH
|
52
|
+
2.0.1
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2019 Jeff Keen
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
# Comma Splice
|
2
|
+
|
3
|
+
This gem tackles one very specific problem: when CSVs have commas in the values and the values haven't been quoted. This determines which commas separate fields and which commas are part of a value, and corrects the file.
|
4
|
+
|
5
|
+
For example, given the following CSV
|
6
|
+
|
7
|
+
```
|
8
|
+
timestamp,artist,title,albumtitle,label
|
9
|
+
01-27-2019 @ 12:34:00,Lester Sterling, Lynn Taitt & The Jets,Check Point Charlie,Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968,Dub Store,
|
10
|
+
01-27-2019 @ 12:31:00,Lester Sterling,Lester Sterling Special,Merritone Rock Steady 2: This Music Got Soul 1966-1967,Dub Store,
|
11
|
+
|
12
|
+
```
|
13
|
+
|
14
|
+
which parses incorrectly as:
|
15
|
+
|
16
|
+
| timestamp | artist | title | albumtitle | label |
|
17
|
+
|-----------------------|-----------------|-------------|-----------------|------------------------------------------------------------|
|
18
|
+
| 01-27-2019 @ 12:34:00 | Lester Sterling | Lynn Taitt & The Jets | Check Point Charlie | Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968 |
|
19
|
+
| 01-27-2019 @ 12:31:00 | Lester Sterling | Lester Sterling Special | Merritone Rock Steady 2: This Music Got Soul 1966-1967 | Dub Store |
|
20
|
+
|
21
|
+
|
22
|
+
Running this through `comma_splice correct /path/to/file` will print this corrected content:
|
23
|
+
|
24
|
+
```
|
25
|
+
timestamp,artist,title,albumtitle,label
|
26
|
+
01-27-2019 @ 12:34:00,"Lester Sterling, Lynn Taitt & The Jets",Check Point Charlie,Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968,Dub Store,
|
27
|
+
01-27-2019 @ 12:31:00,Lester Sterling,Lester Sterling Special,Merritone Rock Steady 2: This Music Got Soul 1966-1967,Dub Store,
|
28
|
+
```
|
29
|
+
|
30
|
+
| timestamp | artist | title | albumtitle | label |
|
31
|
+
|-----------------------|-----------------|-------------|-----------------|------------------------------------------------------------|
|
32
|
+
| 01-27-2019 @ 12:34:00 | Lester Sterling, Lynn Taitt & The Jets | Check Point Charlie | Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968 | Dub Store |
|
33
|
+
| 01-27-2019 @ 12:31:00 | Lester Sterling | Lester Sterling Special | Merritone Rock Steady 2: This Music Got Soul 1966-1967 | Dub Store |
|
34
|
+
|
35
|
+
|
36
|
+
If it can't determine where the comma should go, it prompts you to choose among the possible options.
|
37
|
+
|
38
|
+
|
39
|
+
given the following CSV:
|
40
|
+
|
41
|
+
```
|
42
|
+
playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
|
43
|
+
16851097,,,12-09-2017 @ 09:57:00,10,000 Maniacs and Michael Stipe,To Sir with Love,Campfire Songs,Rhino,post,live,y,
|
44
|
+
16851096,,,12-09-2017 @ 09:44:00,Fran Jeffries,Mine Eyes,Fran Can Really Hang You Up the Most,Warwick,post,live,y,
|
45
|
+
```
|
46
|
+
|
47
|
+
It prompts:
|
48
|
+
|
49
|
+
```
|
50
|
+
Which one of these is correct?
|
51
|
+
|
52
|
+
(1) artist : 10
|
53
|
+
title : 000 Maniacs and Michael Stipe
|
54
|
+
albumtitle: To Sir with Love
|
55
|
+
label : "Campfire Songs,Rhino"
|
56
|
+
|
57
|
+
(2) artist : 10
|
58
|
+
title : 000 Maniacs and Michael Stipe
|
59
|
+
albumtitle: "To Sir with Love,Campfire Songs"
|
60
|
+
label : Rhino
|
61
|
+
|
62
|
+
(3) artist : 10
|
63
|
+
title : "000 Maniacs and Michael Stipe,To Sir with Love"
|
64
|
+
albumtitle: Campfire Songs
|
65
|
+
label : Rhino
|
66
|
+
|
67
|
+
(4) artist : "10,000 Maniacs and Michael Stipe"
|
68
|
+
title : To Sir with Love
|
69
|
+
albumtitle: Campfire Songs
|
70
|
+
label : Rhino
|
71
|
+
```
|
72
|
+
|
73
|
+
Select an option (4), and it returns:
|
74
|
+
|
75
|
+
```
|
76
|
+
playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
|
77
|
+
16851097,,,12-09-2017 @ 09:57:00,"10,000 Maniacs and Michael Stipe",To Sir with Love,Campfire Songs,Rhino,post,live,y,
|
78
|
+
16851096,,,12-09-2017 @ 09:44:00,Fran Jeffries,Mine Eyes,Fran Can Really Hang You Up the Most,Warwick,post,live,y,
|
79
|
+
```
|
80
|
+
|
81
|
+
## Usage
|
82
|
+
|
83
|
+
You can use this in a Ruby program by installing the `comma_splice` gem, or you can install it on your system and use the `comma_splice` command line utility.
|
84
|
+
|
85
|
+
|
86
|
+
##### Return the number of bad lines in a file
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
CommaSplice::FileCorrector.new(file_path).bad_lines.size
|
90
|
+
```
|
91
|
+
```
|
92
|
+
comma_splice bad_line_count /path/to/file.csv
|
93
|
+
```
|
94
|
+
|
95
|
+
##### Display the fixed contents
|
96
|
+
```ruby
|
97
|
+
CommaSplice::FileCorrector.new(file_path).corrected
|
98
|
+
```
|
99
|
+
```bash
|
100
|
+
comma_splice correct /path/to/file.csv
|
101
|
+
```
|
102
|
+
|
103
|
+
##### Process a file and save the fixed version
|
104
|
+
```ruby
|
105
|
+
CommaSplice::FileCorrector.new(file_path).save(save_path)
|
106
|
+
```
|
107
|
+
```bash
|
108
|
+
comma_splice fix /path/to/file.csv /path/to/save
|
109
|
+
```
|
110
|
+
|
111
|
+
## Installation
|
112
|
+
|
113
|
+
Add this line to your application's Gemfile:
|
114
|
+
|
115
|
+
```ruby
|
116
|
+
gem 'comma_splice'
|
117
|
+
```
|
118
|
+
|
119
|
+
And then execute:
|
120
|
+
|
121
|
+
$ bundle
|
122
|
+
|
123
|
+
Or install it yourself as:
|
124
|
+
|
125
|
+
$ gem install comma_splice
|
126
|
+
|
127
|
+
## Development
|
128
|
+
|
129
|
+
After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
130
|
+
|
131
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
132
|
+
|
133
|
+
## Contributing
|
134
|
+
|
135
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/jkeen/comma_splice.
|
136
|
+
|
137
|
+
## License
|
138
|
+
|
139
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bin/comma_splice
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'comma_splice'
|
5
|
+
require 'thor'
|
6
|
+
|
7
|
+
# Thor-based command line interface for the comma_splice gem.
#
# Every command accepts the optional --start_line / --end_line options, which
# are passed through unchanged to CommaSplice::FileCorrector.
class CommaSpliceCLI < Thor
  class_option :start_line, type: :numeric, default: nil
  class_option :end_line, type: :numeric, default: nil

  # Prints the corrected contents of FILE_PATH to standard output.
  desc 'correct FILE_PATH', 'print corrected file contents'
  def correct(file_path)
    puts file_corrector_for(file_path).corrected
  end

  # Writes the corrected contents of FILE_PATH to SAVE_PATH.
  desc 'fix FILE_PATH SAVE_PATH', 'save corrected file contents to SAVE_PATH'
  def fix(file_path, fix_path)
    file_corrector_for(file_path).save(fix_path)
  end

  # Prints every line that needs correcting.
  desc 'bad_lines FILE_PATH', 'show bad lines'
  def bad_lines(file_path)
    puts file_corrector_for(file_path).bad_lines
  end

  # Prints how many lines need correcting.
  desc 'bad_line_count FILE_PATH', 'check file contents for needed corrections'
  def bad_line_count(file_path)
    puts file_corrector_for(file_path).bad_lines.size
  end

  private

  # Builds the FileCorrector shared by every command. Private methods are not
  # exposed as Thor commands.
  def file_corrector_for(file_path)
    CommaSplice::FileCorrector.new(
      file_path,
      start_line: options[:start_line],
      end_line: options[:end_line]
    )
  end
end
|
55
|
+
|
56
|
+
CommaSpliceCLI.start(ARGV)
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "comma_splice"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
|
2
|
+
lib = File.expand_path("../lib", __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require "comma_splice/version"
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
  spec.name          = "comma_splice"
  spec.version       = CommaSplice::VERSION
  spec.authors       = ["Jeff Keen"]
  spec.email         = ["jeff@keen.me"]

  spec.summary       = %q{Fixes CSVs with unescaped commas}
  spec.description   = %q{Determines which commas in an unquoted CSV separate fields and which are part of a value, and corrects the file.}
  spec.homepage      = "http://github.com/jkeen/comma_splice"
  spec.license       = "MIT"

  # Only allow this gem to be pushed to rubygems.org. Change 'allowed_push_host'
  # to push to a different single host, or delete this section to allow
  # pushing to any host.
  if spec.respond_to?(:metadata)
    spec.metadata["allowed_push_host"] = 'https://rubygems.org'

    # spec.metadata["homepage_uri"] = spec.homepage
    # spec.metadata["source_code_uri"] = "http://github.com/jkeen/comma_splice"
    # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
  else
    raise "RubyGems 2.0 or newer is required to protect against " \
      "public gem pushes."
  end

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = "bin"
  spec.executables   << 'comma_splice'
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies: lib/comma_splice.rb requires active_support and
  # bin/comma_splice requires thor, so both must be installed with the gem.
  spec.add_dependency "activesupport"
  spec.add_dependency "thor"

  # NOTE(review): lib/comma_splice.rb also requires 'byebug' at load time;
  # either drop that require or promote byebug to a runtime dependency.
  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "byebug"
end
|
data/lib/comma_splice.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'csv'
|
4
|
+
require 'active_support/core_ext/string'
|
5
|
+
require 'comma_splice/version'
|
6
|
+
require 'comma_splice/helpers/content_finder'
|
7
|
+
require 'comma_splice/helpers/variable_column_finder'
|
8
|
+
require 'comma_splice/helpers/line'
|
9
|
+
require 'comma_splice/helpers/join_possibilities'
|
10
|
+
require 'comma_splice/helpers/comma_calculator'
|
11
|
+
|
12
|
+
require 'comma_splice/line_corrector'
|
13
|
+
require 'comma_splice/file_corrector'
|
14
|
+
require 'byebug'
|
15
|
+
|
16
|
+
module CommaSplice
  # Gem-specific error class; a plain StandardError subclass.
  Error = Class.new(StandardError)
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module CommaSplice
  # Reads a file, locates its CSV content via ContentFinder and rebuilds the
  # file with every line that has the wrong number of values corrected.
  class FileCorrector
    attr_reader :file_contents, :csv_content, :start_line, :end_line, :start_column, :end_column

    # file_path    - path of the file to read (as UTF-8)
    # start_line / end_line     - optional line bounds of the CSV content,
    #                             passed to ContentFinder
    # start_column / end_column - optional column bounds handed to each
    #                             LineCorrector; when either is missing they
    #                             are detected with VariableColumnFinder
    #
    # Raises CommaSplice::Error when no CSV content is found.
    def initialize(file_path, start_line: nil, end_line: nil, start_column: nil, end_column: nil)
      @file_path     = file_path
      @file_contents = File.read(file_path, encoding: 'utf-8')

      @content_finder = ContentFinder.new(@file_contents, start_line, end_line)
      @csv_content    = @content_finder.content
      @start_line     = @content_finder.start_line
      @end_line       = @content_finder.end_line

      # Validate before anything below dereferences @csv_content.
      if @csv_content.nil? || @csv_content.empty?
        raise CommaSplice::Error, "empty contents #{file_path}"
      end

      if start_column && end_column
        @start_column = start_column
        @end_column   = end_column
      else
        finder = VariableColumnFinder.new(@csv_content[0], @csv_content[1..-1])
        @start_column = finder.start_column
        @end_column   = finder.end_column
      end
    end

    # The first line of the CSV content, wrapped in a Line.
    def header
      @header ||= Line.new(csv_content.first)
    end

    # The lines whose value count does not match the header.
    def bad_lines
      line_correctors.select(&:needs_correcting?).collect(&:original)
    end

    def needs_correcting?
      bad_lines.size.positive?
    end

    # The whole file as an array of lines: everything before the CSV content,
    # the (corrected) CSV lines, then everything after the CSV content.
    def corrected
      @corrected ||= begin
        lines    = @file_contents.lines
        leading  = lines[0, @start_line]
        # An end_line of -1 means the CSV content runs to the end of the file.
        trailing = @end_line == -1 ? [] : (lines[(@end_line + 1)..-1] || [])

        [leading, corrected_lines, trailing].flatten
      end
    end

    # Overwrites the file this corrector was built from.
    def save!
      save(@file_path)
    end

    # Writes the corrected content to +path+.
    def save(path)
      # Resolve every correction (which may prompt on STDIN) BEFORE the target
      # is opened and truncated, so an aborted prompt cannot empty the file.
      lines = corrected

      File.open(path, 'w+') do |f|
        lines.each { |line| f.puts line if line }
      end
    end

    def to_json
      @content_finder.parsed.try(:to_json)
    end

    private

    # One LineCorrector per CSV line (header line included).
    def line_correctors
      @line_correctors ||= csv_content.collect do |line|
        LineCorrector.new(header, Line.new(line), @start_column, @end_column)
      end
    end

    def corrected_lines
      line_correctors.collect do |line|
        line.needs_correcting? ? line.corrected : line.original
      end
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module CommaSplice
  # Provide an array of CSV headers and an array of CSV values and this will
  # figure out the best correction, prompting on STDIN when it can't decide.
  class CommaCalculator
    # Raises StandardError when both headers and values exceed 10 entries.
    def initialize(headers, values)
      @headers = headers
      @values  = values

      raise StandardError, "Determining all the possibilities to fit #{@values.size} values into the #{@headers.size} headers #{@headers.inspect} is computationally expensive. Please specify the columns where commas might be." if @headers.size > 10 && @values.size > 10
    end

    # Returns the values regrouped to fit the headers.
    #
    # - counts already match: the values are returned untouched
    # - exactly one plausible grouping: that grouping
    # - several plausible groupings: the user is prompted to pick one
    # - no plausible grouping: the user is prompted with every grouping
    # - no grouping at all (fewer values than headers): the values are
    #   returned untouched, since there is nothing to choose from
    def correction
      return @values if @headers.size == @values.size

      candidates = best_options
      return candidates.first if candidates.size == 1
      return prompt_for_options(candidates) if candidates.size > 1
      return @values if all_options.empty?

      prompt_for_options(all_options)
    end

    # Every way of joining adjacent values so that one value remains per
    # header. Joined groups are wrapped in double quotes.
    def all_options
      @all_options ||= join_possibilities.collect do |joins|
        values = @values.dup
        joins.collect do |join_num|
          val = values.shift(join_num)
          if val.size > 1
            quoted_values(val)
          else
            val.join(',')
          end
        end
      end
    end

    # The options in which no field starts with a space. A leading space
    # suggests the preceding comma was part of a value, not a separator.
    def best_options
      @best_options ||= all_options.select do |option|
        option.none? { |o| o.start_with?(' ', '" ') }
      end
    end

    def requires_manual_input?
      needs_correcting? && best_options.size > 1
    end

    def needs_correcting?
      @headers.size < @values.size
    end

    private

    def quoted_values(values)
      "\"#{values.join(',').gsub(/(?<!")(?:"{2})*\K\"/, '""')}\"" # escape a double quote if it hasn't been escaped already
    end

    def join_possibilities
      JoinPossibilities.new(@values.size, @headers.size).possibilities
    end

    # Prints the numbered options and reads a choice from STDIN until a number
    # between 1 and options.size is entered.
    #
    # Raises StandardError when STDIN is closed before a choice is made.
    def prompt_for_options(options)
      longest_header = @headers.max_by(&:length)

      options.each_with_index do |option, index|
        @headers.each_with_index do |header, i|
          marker = i.zero? ? "(#{index + 1})" : ''
          puts marker.ljust(5) +
               header.ljust(longest_header.size) + ': ' +
               option[i]
        end
        puts "\n"
      end

      loop do
        puts 'which one is correct?'
        answer = STDIN.gets
        # gets returns nil at end of input; re-asking would loop forever
        raise StandardError, 'no option selected before end of input' if answer.nil?

        choice = answer.to_i
        return options[choice - 1] if choice.between?(1, options.size)
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module CommaSplice
  # Given a file this will find the CSV content. Some files have some non-csv junk at the top
  class ContentFinder
    attr_reader :start_line, :end_line, :content

    # file_contents - the whole file as a string
    # start_line / end_line - optional zero-based, inclusive line bounds of
    #                         the CSV content; when either is missing the
    #                         bounds are detected by find_content
    def initialize(file_contents, start_line = nil, end_line = nil)
      @file_contents = file_contents

      if start_line && end_line
        # the csvs this was built for have non-csv headers
        @start_line = start_line
        @end_line   = end_line
        @content    = @file_contents.lines[@start_line..@end_line]
      else
        find_content
      end
    end

    # Detects the CSV content: it starts at the first line with more than two
    # values and ends just before the next line with fewer than two values
    # (end_line is -1 when it runs to the end of the file).
    def find_content
      lines = @file_contents.lines

      @start_line = lines.find_index { |line| value_count(line) > 2 }

      if @start_line.nil?
        # Nothing in the file looks like CSV: report empty content instead of
        # failing on a nil index.
        @start_line = 0
        @end_line   = -1
        @content    = []
        return
      end

      relative_end_line = lines[@start_line..-1].find_index { |line| value_count(line) < 2 }

      @end_line = relative_end_line ? @start_line + relative_end_line - 1 : -1
      @content  = lines[@start_line..@end_line]
    end

    # Parses the CSV content with the first row as headers, retrying with
    # alternative quote characters when the content is malformed.
    def parsed
      quote_chars = %w[" | ~ ^ & *]
      begin
        # The lines keep their own line endings, so join them without a separator.
        CSV.parse(@content.join, quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
      rescue CSV::MalformedCSVError
        quote_chars.empty? ? raise : retry
      end
    end

    private

    # Number of CSV values on a single line (0 when the line parses to nothing).
    def value_count(line)
      Array(Line.new(line).values).size
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CommaSplice
  # Enumerates every way of grouping +value_count+ adjacent values into
  # +header_count+ groups. Each possibility is an array of group sizes,
  # e.g. [1, 2, 1] means "one value, then two joined values, then one value".
  class JoinPossibilities
    attr_reader :from_size, :to_size

    def initialize(value_count, header_count)
      @from_size = value_count
      @to_size   = header_count
    end

    # All distinct orderings of group sizes (each size >= 1) that sum to
    # from_size. Empty when there are fewer values than headers.
    def possibilities
      @possibilities ||= permutations(combos(from_size, to_size))
    end

    private

    # Expands each ascending combination into every distinct ordering.
    def permutations(combinations)
      combinations.flat_map { |combo| combo.permutation(to_size).to_a }.uniq
    end

    # Determine all ascending combinations of [count] numbers that add up to
    # [desired_size], e.g. if we have an array of 6 items and want an array of
    # 4 items we need 4 numbers that add up to 6 => [[1, 1, 1, 3], [1, 1, 2, 2]]
    def combos(desired_size, count, minimum = 1)
      return [] if desired_size < count || desired_size < minimum
      # Always a list of combinations, so a single header yields [[n]]
      # rather than a bare [n] that callers cannot permute.
      return [[desired_size]] if count == 1

      (minimum..desired_size - 1).flat_map do |i|
        combos(desired_size - i, count - 1, i).map { |r| [i, *r] }
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module CommaSplice
  # A single line of text together with the values of its first parsed CSV row.
  class Line
    attr_reader :values, :line

    def initialize(line)
      @line = line
      rows = parse_csv_content(line)
      @values = rows.first
    end

    private

    # Parses +content+ as CSV, trying each candidate quote character in turn.
    # The CSV::MalformedCSVError from the last candidate is re-raised.
    def parse_csv_content(content, headers = false)
      candidates = %w[" | ~ ^ & *]

      candidates.each_with_index do |quote, position|
        begin
          return CSV.parse(content.mb_chars.tidy_bytes.to_s, quote_char: quote, headers: headers, liberal_parsing: true)
        rescue CSV::MalformedCSVError
          raise if position == candidates.size - 1
        end
      end
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CommaSplice
  # Given a header line and some value lines this will try to figure out the columns
  # where it's likely an error might be.
  #
  # A column counts as non-variable when its text has the same length on every
  # value line. Non-variable columns on the left and right are excluded.
  #
  # For example, the following CSV evaluates to start_column = 4 and
  # end_column = -5 (the bounds of artist..label), since playid, playtype,
  # genre, and timestamp are all non-variable on the left, and prepost,
  # programtype, iswebcast, and isrequest are non-variable on the right
  #
  # playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
  # 17385098,,,01-27-2019 @ 13:58:00,Niney & Soul Syndicate,So Long Dub,Dub Box Set Vol. 2,Trojan,post,live,y,
  # 17385097,,,01-27-2019 @ 13:57:00,King Tubby,Love Thy Neighbor,Jesus Dread,Blood & Fire,post,live,y,
  # 17385096,,,01-27-2019 @ 13:53:00,King Tubby / The Aggrovators,Declaration Of Dub,Dub From The Roots,Charly,post,live,y,
  # 17385095,,,01-27-2019 @ 13:50:00,Harry Mudie / King Tubby,Dub With A Difference,In Dub Conference Vol. 1,Moodisc,post,live,y,
  # 17385094,,,01-27-2019 @ 13:47:00,KIng Tubby Meets The Upsetter,King And The Upsetter At Spanish Town,KIng Tubby Meets The Upsetter,Celluloid,post,live,y,
  class VariableColumnFinder
    attr_reader :start_column, :end_column

    # header_line - the raw header line (String)
    # value_lines - the raw value lines (Array of String)
    def initialize(header_line, value_lines)
      @values = value_lines
      @header = header_line

      find_variable_column_boundaries
    end

    # Sets @start_column / @end_column from the columns that vary when read
    # from the left AND when read from the right. Falls back to the whole row
    # (0 / -1) when no column varies, e.g. with a single value line.
    def find_variable_column_boundaries
      # Now given both of these, we can eliminate some columns on the left and right
      variables = left_to_right_index.zip(right_to_left_index).map do |pair|
        pair == [false, false]
      end

      first_variable = variables.find_index(true)

      if first_variable.nil?
        @start_column = 0
        @end_column   = -1
        return
      end

      from_right = variables.reverse.find_index(true)

      @start_column = first_variable
      # A right bound of 0 is not a valid negative index; -1 is the last column.
      @end_column   = from_right.zero? ? -1 : -from_right
    end

    private

    def column_count
      @header.split(',').size
    end

    # True when the comma-separated piece at +index+ has the same length on
    # every value line. A missing piece counts as length 0.
    def uniform_at?(index)
      @values.map { |value_line| value_line.split(',')[index].to_s.size }.uniq.size == 1
    end

    # One flag per header column, indexing value lines from the left.
    def left_to_right_index
      Array.new(column_count) { |time| uniform_at?(time) }
    end

    # One flag per header column, indexing value lines from the right.
    #
    # NOTE(review): the offsets are 0, -1, -2, ... so the final entry reads
    # column 0 rather than the last column, leaving this index shifted by one
    # against left_to_right_index. The shift is kept on purpose: end_column is
    # derived from it and yields -5 for the example above.
    def right_to_left_index
      (0...column_count).map { |time| uniform_at?(-time) }.reverse
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module CommaSplice
  # Corrects one CSV line against its header line by regrouping the values
  # between left_bounds and right_bounds with CommaCalculator.
  class LineCorrector
    attr_reader :headers, :values, :header_line, :value_line, :right_bounds, :left_bounds

    # header_line / value_line - a Line, or anything Line.new accepts
    # left_bounds  - index of the first value that may contain a stray comma (>= 0)
    # right_bounds - negative index of the last value that may contain one (< 0)
    #
    # Raises RuntimeError when a bound is out of range.
    def initialize(header_line, value_line, left_bounds = 0, right_bounds = -1)
      header_line = Line.new(header_line) unless header_line.is_a?(Line)
      value_line  = Line.new(value_line) unless value_line.is_a?(Line)

      @header_line  = header_line
      @value_line   = value_line
      @headers      = header_line.values
      @values       = value_line.values
      @left_bounds  = left_bounds
      @right_bounds = right_bounds

      raise 'right bounds must be negative' unless right_bounds < 0
      raise 'left bounds must not be negative' unless left_bounds >= 0
    end

    # True when the line has values and their count differs from the headers.
    def needs_correcting?
      @values && @values.size > 0 && @headers.size != @values.size
    end

    # The line's values re-joined with commas (empty string when there are none).
    def original
      Array(@values).join(',')
    end

    def corrected
      # you want to provide this with the smallest set of possibilities
      # for performance reasons. Left and right bounds limit the values
      # where the comma error could be

      # For instance, with the following headers:
      # [playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest]
      # the only values that could contain an extra comma are "artist,title,albumtitle,label"
      # therefore our left_bounds = 4, right_bounds = -5

      values_before = left_bounds > 0 ? values[0..(left_bounds - 1)] : []
      values_after  = right_bounds < -1 ? values[(right_bounds + 1)..-1] : []

      [values_before, corrector.correction, values_after].flatten.join(',')
    end

    private

    # Built once per line so repeated calls reuse the same calculator.
    def corrector
      @corrector ||= CommaCalculator.new(selected_headers, selected_values)
    end

    def selected_headers
      headers[left_bounds..right_bounds]
    end

    def selected_values
      values[left_bounds..right_bounds]
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: comma_splice
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jeff Keen
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-08-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: byebug
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: activesupport
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: thor
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: ''
|
98
|
+
email:
|
99
|
+
- jeff@keen.me
|
100
|
+
executables:
|
101
|
+
- comma_splice
|
102
|
+
extensions: []
|
103
|
+
extra_rdoc_files: []
|
104
|
+
files:
|
105
|
+
- ".gitignore"
|
106
|
+
- ".rspec"
|
107
|
+
- Gemfile
|
108
|
+
- Gemfile.lock
|
109
|
+
- LICENSE.txt
|
110
|
+
- README.md
|
111
|
+
- Rakefile
|
112
|
+
- bin/comma_splice
|
113
|
+
- bin/console
|
114
|
+
- bin/setup
|
115
|
+
- comma_splice.gemspec
|
116
|
+
- lib/comma_splice.rb
|
117
|
+
- lib/comma_splice/file_corrector.rb
|
118
|
+
- lib/comma_splice/helpers/comma_calculator.rb
|
119
|
+
- lib/comma_splice/helpers/content_finder.rb
|
120
|
+
- lib/comma_splice/helpers/join_possibilities.rb
|
121
|
+
- lib/comma_splice/helpers/line.rb
|
122
|
+
- lib/comma_splice/helpers/variable_column_finder.rb
|
123
|
+
- lib/comma_splice/line_corrector.rb
|
124
|
+
- lib/comma_splice/version.rb
|
125
|
+
homepage: http://github.com/jkeen/comma_splice
|
126
|
+
licenses:
|
127
|
+
- MIT
|
128
|
+
metadata:
|
129
|
+
allowed_push_host: https://rubygems.org
|
130
|
+
post_install_message:
|
131
|
+
rdoc_options: []
|
132
|
+
require_paths:
|
133
|
+
- lib
|
134
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
140
|
+
requirements:
|
141
|
+
- - ">="
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: '0'
|
144
|
+
requirements: []
|
145
|
+
rubyforge_project:
|
146
|
+
rubygems_version: 2.7.8
|
147
|
+
signing_key:
|
148
|
+
specification_version: 4
|
149
|
+
summary: Fixes CSVs with unescaped commas
|
150
|
+
test_files: []
|