comma_splice 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +52 -0
- data/LICENSE.txt +21 -0
- data/README.md +139 -0
- data/Rakefile +2 -0
- data/bin/comma_splice +56 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/comma_splice.gemspec +46 -0
- data/lib/comma_splice.rb +18 -0
- data/lib/comma_splice/file_corrector.rb +81 -0
- data/lib/comma_splice/helpers/comma_calculator.rb +86 -0
- data/lib/comma_splice/helpers/content_finder.rb +47 -0
- data/lib/comma_splice/helpers/join_possibilities.rb +43 -0
- data/lib/comma_splice/helpers/line.rb +21 -0
- data/lib/comma_splice/helpers/variable_column_finder.rb +67 -0
- data/lib/comma_splice/line_corrector.rb +66 -0
- data/lib/comma_splice/version.rb +3 -0
- metadata +150 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e0a958c45c6fe94fbc6a4af1cb5fc2a0c20cf7a68728520d227b62a79e2078a5
|
4
|
+
data.tar.gz: 7e4e09b4a3308a2ccb2c5a58d1b56ced9f6f5927bf41d89680c4401b5aad2b5d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1daff9cfdd5f54c37bd47edc83b02308a0f3fe7b0d8af86dd0a910e264929a5b5b1191386b3b06df42f157770c4bc733c656e3beb4d9bafe10d7bc65a5483185
|
7
|
+
data.tar.gz: f6e4c32e2e03d3bf915a9b8360146eaa34ed9b34a954cb4b9a74b08c68ef4f8ae15cba83c854f96b88b098b791f802200992e46194a373e27cdd388c40311136
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
comma_splice (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
activesupport (5.2.3)
|
10
|
+
concurrent-ruby (~> 1.0, >= 1.0.2)
|
11
|
+
i18n (>= 0.7, < 2)
|
12
|
+
minitest (~> 5.1)
|
13
|
+
tzinfo (~> 1.1)
|
14
|
+
byebug (11.0.1)
|
15
|
+
concurrent-ruby (1.1.5)
|
16
|
+
diff-lcs (1.3)
|
17
|
+
i18n (1.6.0)
|
18
|
+
concurrent-ruby (~> 1.0)
|
19
|
+
minitest (5.11.3)
|
20
|
+
rake (10.5.0)
|
21
|
+
rspec (3.8.0)
|
22
|
+
rspec-core (~> 3.8.0)
|
23
|
+
rspec-expectations (~> 3.8.0)
|
24
|
+
rspec-mocks (~> 3.8.0)
|
25
|
+
rspec-core (3.8.2)
|
26
|
+
rspec-support (~> 3.8.0)
|
27
|
+
rspec-expectations (3.8.4)
|
28
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
29
|
+
rspec-support (~> 3.8.0)
|
30
|
+
rspec-mocks (3.8.1)
|
31
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
32
|
+
rspec-support (~> 3.8.0)
|
33
|
+
rspec-support (3.8.2)
|
34
|
+
thor (0.20.3)
|
35
|
+
thread_safe (0.3.6)
|
36
|
+
tzinfo (1.2.5)
|
37
|
+
thread_safe (~> 0.1)
|
38
|
+
|
39
|
+
PLATFORMS
|
40
|
+
ruby
|
41
|
+
|
42
|
+
DEPENDENCIES
|
43
|
+
activesupport
|
44
|
+
bundler (~> 2.0)
|
45
|
+
byebug
|
46
|
+
comma_splice!
|
47
|
+
rake (~> 10.0)
|
48
|
+
rspec
|
49
|
+
thor
|
50
|
+
|
51
|
+
BUNDLED WITH
|
52
|
+
2.0.1
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2019 Jeff Keen
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
# Comma Splice
|
2
|
+
|
3
|
+
This gem tackles one very specific problem: when CSVs have commas in the values and the values haven't been quoted. This determines which commas separate fields and which commas are part of a value, and corrects the file.
|
4
|
+
|
5
|
+
For example, given the following CSV
|
6
|
+
|
7
|
+
```
|
8
|
+
timestamp,artist,title,albumtitle,label
|
9
|
+
01-27-2019 @ 12:34:00,Lester Sterling, Lynn Taitt & The Jets,Check Point Charlie,Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968,Dub Store,
|
10
|
+
01-27-2019 @ 12:31:00,Lester Sterling,Lester Sterling Special,Merritone Rock Steady 2: This Music Got Soul 1966-1967,Dub Store,
|
11
|
+
|
12
|
+
```
|
13
|
+
|
14
|
+
which parses incorrectly as:
|
15
|
+
|
16
|
+
| timestamp | artist | title | albumtitle | label |
|
17
|
+
|-----------------------|-----------------|-------------|-----------------|------------------------------------------------------------|
|
18
|
+
| 01-27-2019 @ 12:34:00 | Lester Sterling | Lynn Taitt & The Jets | Check Point Charlie | Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968 |
|
19
|
+
| 01-27-2019 @ 12:31:00 | Lester Sterling | Lester Sterling Special | Merritone Rock Steady 2: This Music Got Soul 1966-1967 | Dub Store |
|
20
|
+
|
21
|
+
|
22
|
+
Running this through `comma_splice correct /path/to/file` will return this corrected content:
|
23
|
+
|
24
|
+
```
|
25
|
+
timestamp,artist,title,albumtitle,label
|
26
|
+
01-27-2019 @ 12:34:00,"Lester Sterling, Lynn Taitt & The Jets",Check Point Charlie,Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968,Dub Store,
|
27
|
+
01-27-2019 @ 12:31:00,Lester Sterling,Lester Sterling Special,Merritone Rock Steady 2: This Music Got Soul 1966-1967,Dub Store,
|
28
|
+
```
|
29
|
+
|
30
|
+
| timestamp | artist | title | albumtitle | label |
|
31
|
+
|-----------------------|-----------------|-------------|-----------------|------------------------------------------------------------|
|
32
|
+
| 01-27-2019 @ 12:34:00 | Lester Sterling, Lynn Taitt & The Jets | Check Point Charlie | Merritone Rock Steady 3: Bang Bang Rock Steady 1966-1968 | Dub Store |
|
33
|
+
| 01-27-2019 @ 12:31:00 | Lester Sterling | Lester Sterling Special | Merritone Rock Steady 2: This Music Got Soul 1966-1967 | Dub Store |
|
34
|
+
|
35
|
+
|
36
|
+
If it can't determine where the comma should go, it prompts you to choose between the possible options.
|
37
|
+
|
38
|
+
|
39
|
+
Given the following CSV:
|
40
|
+
|
41
|
+
```
|
42
|
+
playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
|
43
|
+
16851097,,,12-09-2017 @ 09:57:00,10,000 Maniacs and Michael Stipe,To Sir with Love,Campfire Songs,Rhino,post,live,y,
|
44
|
+
16851096,,,12-09-2017 @ 09:44:00,Fran Jeffries,Mine Eyes,Fran Can Really Hang You Up the Most,Warwick,post,live,y,
|
45
|
+
```
|
46
|
+
|
47
|
+
It prompts:
|
48
|
+
|
49
|
+
```
|
50
|
+
Which one of these is correct?
|
51
|
+
|
52
|
+
(1) artist : 10
|
53
|
+
title : 000 Maniacs and Michael Stipe
|
54
|
+
albumtitle: To Sir with Love
|
55
|
+
label : "Campfire Songs,Rhino"
|
56
|
+
|
57
|
+
(2) artist : 10
|
58
|
+
title : 000 Maniacs and Michael Stipe
|
59
|
+
albumtitle: "To Sir with Love,Campfire Songs"
|
60
|
+
label : Rhino
|
61
|
+
|
62
|
+
(3) artist : 10
|
63
|
+
title : "000 Maniacs and Michael Stipe,To Sir with Love"
|
64
|
+
albumtitle: Campfire Songs
|
65
|
+
label : Rhino
|
66
|
+
|
67
|
+
(4) artist : "10,000 Maniacs and Michael Stipe"
|
68
|
+
title : To Sir with Love
|
69
|
+
albumtitle: Campfire Songs
|
70
|
+
label : Rhino
|
71
|
+
```
|
72
|
+
|
73
|
+
Select an option (4), and it returns:
|
74
|
+
|
75
|
+
```
|
76
|
+
playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
|
77
|
+
16851097,,,12-09-2017 @ 09:57:00,"10,000 Maniacs and Michael Stipe",To Sir with Love,Campfire Songs,Rhino,post,live,y,
|
78
|
+
16851096,,,12-09-2017 @ 09:44:00,Fran Jeffries,Mine Eyes,Fran Can Really Hang You Up the Most,Warwick,post,live,y,
|
79
|
+
```
|
80
|
+
|
81
|
+
## Usage
|
82
|
+
|
83
|
+
You can use this in a Ruby program by installing the `comma_splice` gem, or you can install it on your system and use the `comma_splice` command line utility.
|
84
|
+
|
85
|
+
|
86
|
+
##### Return the number of bad lines in a file
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
CommaSplice::FileCorrector.new(file_path).bad_lines.size
|
90
|
+
```
|
91
|
+
```
|
92
|
+
comma_splice bad_line_count /path/to/file.csv
|
93
|
+
```
|
94
|
+
|
95
|
+
##### Display the fixed contents
|
96
|
+
```ruby
|
97
|
+
CommaSplice::FileCorrector.new(file_path).corrected
|
98
|
+
```
|
99
|
+
```bash
|
100
|
+
comma_splice correct /path/to/file.csv
|
101
|
+
```
|
102
|
+
|
103
|
+
##### Process a file and save the fixed version
|
104
|
+
```ruby
|
105
|
+
CommaSplice::FileCorrector.new(file_path).save(save_path)
|
106
|
+
```
|
107
|
+
```bash
|
108
|
+
comma_splice fix /path/to/file.csv /path/to/save
|
109
|
+
```
|
110
|
+
|
111
|
+
## Installation
|
112
|
+
|
113
|
+
Add this line to your application's Gemfile:
|
114
|
+
|
115
|
+
```ruby
|
116
|
+
gem 'comma_splice'
|
117
|
+
```
|
118
|
+
|
119
|
+
And then execute:
|
120
|
+
|
121
|
+
$ bundle
|
122
|
+
|
123
|
+
Or install it yourself as:
|
124
|
+
|
125
|
+
$ gem install comma_splice
|
126
|
+
|
127
|
+
## Development
|
128
|
+
|
129
|
+
After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
130
|
+
|
131
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
132
|
+
|
133
|
+
## Contributing
|
134
|
+
|
135
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/jkeen/comma_splice.
|
136
|
+
|
137
|
+
## License
|
138
|
+
|
139
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bin/comma_splice
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'comma_splice'
|
5
|
+
require 'thor'
|
6
|
+
|
7
|
+
# Thor-based command line interface for the comma_splice gem.
#
# Every command accepts the optional --start_line / --end_line options,
# which are forwarded unchanged to CommaSplice::FileCorrector.
class CommaSpliceCLI < Thor
  class_option :start_line, type: :numeric, default: nil
  class_option :end_line, type: :numeric, default: nil

  # Prints the corrected contents of FILE_PATH to standard output.
  desc 'correct FILE_PATH', 'return corrected file contents'
  def correct(file_path)
    puts file_corrector_for(file_path).corrected
  end

  # Writes the corrected contents of FILE_PATH to SAVE_PATH.
  # SAVE_PATH is required: the method takes two positional arguments.
  desc 'fix FILE_PATH SAVE_PATH', 'save corrected file contents to SAVE_PATH'
  def fix(file_path, fix_path)
    file_corrector_for(file_path).save(fix_path)
  end

  # Prints every line that needs correcting.
  desc 'bad_lines FILE_PATH', 'show bad lines'
  def bad_lines(file_path)
    puts file_corrector_for(file_path).bad_lines
  end

  # Prints the number of lines that need correcting.
  desc 'bad_line_count FILE_PATH', 'check file contents for needed corrections'
  def bad_line_count(file_path)
    puts file_corrector_for(file_path).bad_lines.size
  end

  private

  # Builds the FileCorrector shared by all commands. Private methods are not
  # registered as Thor commands.
  def file_corrector_for(file_path)
    CommaSplice::FileCorrector.new(
      file_path,
      start_line: options[:start_line],
      end_line: options[:end_line]
    )
  end
end
|
55
|
+
|
56
|
+
CommaSpliceCLI.start(ARGV)
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "comma_splice"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
|
2
|
+
lib = File.expand_path("../lib", __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require "comma_splice/version"
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
  spec.name          = "comma_splice"
  spec.version       = CommaSplice::VERSION
  spec.authors       = ["Jeff Keen"]
  spec.email         = ["jeff@keen.me"]

  spec.summary       = %q{Fixes CSVs with unescaped commas}
  spec.description   = %q{Determines which commas in an unquoted CSV separate fields and which commas are part of a value, and corrects the file.}
  spec.homepage      = "http://github.com/jkeen/comma_splice"
  spec.license       = "MIT"

  # Restrict `gem push` to rubygems.org. Change 'allowed_push_host' to push to
  # a different single host, or delete this section to allow any host.
  if spec.respond_to?(:metadata)
    spec.metadata["allowed_push_host"] = 'https://rubygems.org'

    # spec.metadata["homepage_uri"] = spec.homepage
    # spec.metadata["source_code_uri"] = "http://github.com/jkeen/comma_splice"
    # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
  else
    raise "RubyGems 2.0 or newer is required to protect against " \
      "public gem pushes."
  end

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = "bin"
  spec.executables   << 'comma_splice'
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies: lib/comma_splice.rb requires
  # 'active_support/core_ext/string' and bin/comma_splice requires 'thor', so
  # both must be installed together with the gem.
  spec.add_dependency "activesupport"
  spec.add_dependency "thor"

  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
  # NOTE(review): lib/comma_splice.rb currently does `require 'byebug'`
  # unconditionally; that require should be removed from the library,
  # otherwise loading the gem fails wherever byebug is not installed.
  spec.add_development_dependency "byebug"
end
|
data/lib/comma_splice.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'csv'
|
4
|
+
require 'active_support/core_ext/string'
|
5
|
+
require 'comma_splice/version'
|
6
|
+
require 'comma_splice/helpers/content_finder'
|
7
|
+
require 'comma_splice/helpers/variable_column_finder'
|
8
|
+
require 'comma_splice/helpers/line'
|
9
|
+
require 'comma_splice/helpers/join_possibilities'
|
10
|
+
require 'comma_splice/helpers/comma_calculator'
|
11
|
+
|
12
|
+
require 'comma_splice/line_corrector'
|
13
|
+
require 'comma_splice/file_corrector'
|
14
|
+
require 'byebug'
|
15
|
+
|
16
|
+
module CommaSplice
  # Error type raised by the gem; a direct subclass of StandardError.
  Error = Class.new(StandardError)
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module CommaSplice
  # Reads a file, locates the CSV portion of it (via ContentFinder) and
  # corrects every CSV line whose value count differs from the header's.
  # Lines before and after the CSV portion are passed through untouched.
  class FileCorrector
    attr_reader :file_contents, :csv_content, :start_line, :end_line, :start_column, :end_column

    # file_path    - path of the file to read (read as UTF-8).
    # start_line / end_line     - optional line indexes of the CSV content;
    #                             located automatically unless both are given.
    # start_column / end_column - optional bounds of the columns that may
    #                             contain stray commas; detected with
    #                             VariableColumnFinder unless both are given.
    #
    # Raises CommaSplice::Error when no CSV content is found.
    def initialize(file_path, start_line: nil, end_line: nil, start_column: nil, end_column: nil)
      @file_path     = file_path
      @file_contents = File.read(file_path, encoding: 'utf-8')

      @content_finder = ContentFinder.new(@file_contents, start_line, end_line)
      @csv_content    = @content_finder.content
      @start_line     = @content_finder.start_line
      @end_line       = @content_finder.end_line

      # Validate before @csv_content is indexed below.
      raise CommaSplice::Error, "empty contents #{file_path}" unless @csv_content.present?

      if start_column && end_column
        @start_column = start_column
        @end_column   = end_column
      else
        finder = VariableColumnFinder.new(@csv_content[0], @csv_content[1..-1])
        @start_column = finder.start_column
        @end_column   = finder.end_column
      end
    end

    # The first CSV line wrapped in a Line.
    def header
      @header ||= Line.new(csv_content.first)
    end

    # The lines whose value count does not match the header, as strings.
    def bad_lines
      line_correctors.select(&:needs_correcting?).collect(&:original)
    end

    def needs_correcting?
      bad_lines.size.positive?
    end

    # The whole file as an array of lines: untouched leading lines, the
    # (corrected) CSV lines, then untouched trailing lines.
    def corrected
      @corrected ||= [
        @file_contents.lines[0, @start_line],
        corrected_lines,
        trailing_lines
      ].flatten
    end

    # Overwrites the file that was read.
    def save!
      save(@file_path)
    end

    # Writes the corrected lines to path, one per line.
    def save(path)
      File.open(path, 'w+') do |f|
        corrected.each do |line|
          f.puts line if line
        end
      end
    end

    def to_json
      @content_finder.parsed.try(:to_json)
    end

    private

    # Lines following the CSV content. @end_line is an inclusive index and
    # -1 means the CSV content runs to the end of the file.
    def trailing_lines
      return [] if @end_line == -1

      @file_contents.lines[(@end_line + 1)..-1] || []
    end

    def line_correctors
      @line_correctors ||= csv_content.collect do |line|
        LineCorrector.new(header, Line.new(line), @start_column, @end_column)
      end
    end

    def corrected_lines
      line_correctors.collect do |line|
        line.needs_correcting? ? line.corrected : line.original
      end
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module CommaSplice
  # Provide an array of CSV headers and an array of CSV values
  # and this will figure out the best correction, prompting
  # you on standard input if it can't decide by itself.
  class CommaCalculator
    # headers - array of header names.
    # values  - array of values that has to be fitted into the headers.
    #
    # Raises StandardError when both arrays have more than 10 entries.
    def initialize(headers, values)
      @headers = headers
      @values = values

      raise StandardError, "Determining all the possibilities to fit #{@values.size} values into the #{@headers.size} headers #{@headers.inspect} is computationally expensive. Please specify the columns where commas might be." if @headers.size > 10 && @values.size > 10
    end

    # Returns an array with one (possibly quoted) value per header.
    # Asks the user when zero or several plausible options remain.
    def correction
      return @values if @headers.size == @values.size

      candidates = best_options
      case candidates.size
      when 0 then prompt_for_options(all_options)
      when 1 then candidates.first
      else prompt_for_options(candidates)
      end
    end

    # Every way of joining adjacent values so that they fit the headers.
    # Joined values are wrapped in double quotes. Memoized because the
    # enumeration is combinatorial and requested several times per correction.
    def all_options
      @all_options ||= join_possibilities.collect do |joins|
        values = @values.dup
        joins.collect do |join_num|
          val = values.shift(join_num)
          if val.size > 1
            quoted_values(val)
          else
            val.join(',')
          end
        end
      end
    end

    # Options in which no value begins with a space (optionally after an
    # opening quote); a leading space marks a split at an in-value comma.
    def best_options
      @best_options ||= all_options.select do |option|
        option.none? { |o| o.start_with?(' ', '" ') }
      end
    end

    def requires_manual_input?
      needs_correcting? && best_options.size > 1
    end

    def needs_correcting?
      @headers.size < @values.size
    end

    private

    def quoted_values(values)
      "\"#{values.join(',').gsub(/(?<!")(?:"{2})*\K\"/, '""')}\"" # escape a double quote if it hasn't been escaped already
    end

    def join_possibilities
      JoinPossibilities.new(@values.size, @headers.size).possibilities
    end

    # Prints the options and reads a 1-based choice from STDIN until a valid
    # one is entered.
    #
    # Raises CommaSplice::Error when there is nothing to choose from or when
    # standard input ends before a choice was made.
    def prompt_for_options(options)
      if options.empty?
        raise CommaSplice::Error, "no way to fit #{@values.size} values into #{@headers.size} headers"
      end

      label_width = @headers.map { |header| header.to_s.length }.max.to_i

      options.each_with_index do |option, index|
        @headers.each_with_index do |header, i|
          marker = i.zero? ? "(#{index + 1})" : ''
          puts marker.ljust(5) +
               header.to_s.ljust(label_width) + ': ' +
               option[i].to_s
        end
        puts "\n"
      end

      loop do
        puts 'which one is correct?'
        answer = STDIN.gets
        # gets returns nil at end of input; asking again would never end.
        raise CommaSplice::Error, 'no option selected (end of input)' if answer.nil?

        choice = answer.to_i
        return options[choice - 1] if choice.between?(1, options.size)
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module CommaSplice
  # Given a file this will find the CSV content. Some files have some non-csv junk at the top
  class ContentFinder
    attr_reader :start_line, :end_line, :content

    # file_contents - the whole file as one string.
    # start_line / end_line - inclusive line indexes of the CSV content;
    #                         when either is missing both are detected.
    def initialize(file_contents, start_line = nil, end_line = nil)
      @file_contents = file_contents

      if start_line && end_line
        # the csvs this was built for have non-csv headers
        @start_line = start_line
        @end_line = end_line
        @content = @file_contents.lines[@start_line..@end_line]
      else
        find_content
      end
    end

    # Sets start_line to the first line with more than two values and
    # end_line to the line before the next line with fewer than two values
    # (-1 when the content runs to the end of the file).
    # When no line qualifies, content is empty and both indexes are nil.
    def find_content
      lines = @file_contents.lines

      @start_line = lines.find_index do |line|
        Line.new(line).values.size > 2
      end

      if @start_line.nil?
        @end_line = nil
        return @content = []
      end

      relative_end_line = lines[@start_line..-1].find_index do |line|
        Line.new(line).values.size < 2
      end

      @end_line = relative_end_line ? @start_line + relative_end_line - 1 : -1
      @content = lines[@start_line..@end_line]
    end

    # Parses the content as CSV with the first row as headers, retrying with
    # alternative quote characters when the CSV is malformed.
    def parsed
      quote_chars = %w[" | ~ ^ & *]
      # Each entry already ends with its own line break, so join without a
      # separator.
      text = Array(@content).join
      begin
        CSV.parse(text, quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
      rescue CSV::MalformedCSVError
        quote_chars.empty? ? raise : retry
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CommaSplice
  # Enumerates every way of grouping `value_count` adjacent values into
  # `header_count` columns. Each possibility is an array holding the number
  # of values that go into each column, e.g. [1, 2, 1].
  class JoinPossibilities
    attr_reader :from_size, :to_size

    def initialize(value_count, header_count)
      @from_size = value_count
      @to_size   = header_count
    end

    # Returns an array of arrays; empty when the values cannot fill the
    # columns (fewer values than headers).
    def possibilities
      @possibilities ||= permutations(combos(from_size, to_size))
    end

    private

    def permutations(combinations)
      # get all permutations of those combinations
      # to determine every possibility of join
      all_permutations = combinations.collect do |combo|
        combo.permutation(to_size).to_a
      end

      # flatten down to a list of arrays
      all_permutations.flatten(1).uniq
    end

    def combos(desired_size, count, minimum = 1)
      # determine all combinations of [count] numbers that add up to [desired_size]
      # e.g if we have an array of 6 items and want an array of 4 items
      # we need 4 numbers that add up to 6, => [[1, 1, 1, 3], [1, 1, 2, 2]]

      return [] if desired_size < count || desired_size < minimum
      # Always return a list of combinations, so a single target column
      # yields [[n]] rather than a bare [n] that cannot be permuted.
      return [[desired_size]] if count == 1

      (minimum..desired_size - 1).flat_map do |i|
        combos(desired_size - i, count - 1, i).map { |r| [i, *r] }
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module CommaSplice
  # One raw line of text together with the CSV values parsed from it.
  class Line
    attr_reader :values, :line

    # line - the raw text of a single line.
    def initialize(line)
      @line = line
      rows = parse_csv_content(line)
      @values = rows.first
    end

    private

    # Parses content as CSV, starting with the double quote as quote
    # character and moving on to the next candidate whenever the parser
    # reports malformed CSV. Re-raises once every candidate has failed.
    def parse_csv_content(content, headers = false)
      remaining_quote_chars = ['"', '|', '~', '^', '&', '*']
      begin
        quote_char = remaining_quote_chars.shift
        CSV.parse(content.mb_chars.tidy_bytes.to_s, quote_char: quote_char, headers: headers, liberal_parsing: true)
      rescue CSV::MalformedCSVError
        raise if remaining_quote_chars.empty?

        retry
      end
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CommaSplice
  # Given a header line and some value lines this will try to figure out the columns
  # where it's likely an error might be.

  # A column counts as fixed when its field has the same length on every
  # value line. Fixed columns on the left and right edges are ignored; the
  # columns in between are the ones that may contain stray commas.

  # For example, the following CSV will evaluate with @start_column = 4 and @end_column = -5
  # (an index from the left and an index from the right, both inclusive),
  # since in this example playid, playtype, genre, and timestamp are all non-variable on the left,
  # and prepost, programtype, iswebcast, and isrequest are non-variable on the right

  # playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest
  # 17385098,,,01-27-2019 @ 13:58:00,Niney & Soul Syndicate,So Long Dub,Dub Box Set Vol. 2,Trojan,post,live,y,
  # 17385097,,,01-27-2019 @ 13:57:00,King Tubby,Love Thy Neighbor,Jesus Dread,Blood & Fire,post,live,y,
  # 17385096,,,01-27-2019 @ 13:53:00,King Tubby / The Aggrovators,Declaration Of Dub,Dub From The Roots,Charly,post,live,y,
  # 17385095,,,01-27-2019 @ 13:50:00,Harry Mudie / King Tubby,Dub With A Difference,In Dub Conference Vol. 1,Moodisc,post,live,y,
  # 17385094,,,01-27-2019 @ 13:47:00,KIng Tubby Meets The Upsetter,King And The Upsetter At Spanish Town,KIng Tubby Meets The Upsetter,Celluloid,post,live,y,

  class VariableColumnFinder
    attr_reader :start_column, :end_column

    # header_line - the raw header line (a string).
    # value_lines - array of raw value lines (strings).
    def initialize(header_line, value_lines)
      @values = value_lines
      @header = header_line

      find_variable_column_boundaries
    end

    # Sets @start_column to the index of the first variable column and
    # @end_column to the negative index of the last one. A column is variable
    # when its field length differs between lines both when counted from the
    # left and when counted from the right. Falls back to the full row
    # (0 and -1) when no column is variable.
    def find_variable_column_boundaries
      variables = left_to_right_index.zip(right_to_left_index).map do |pair|
        pair == [false, false]
      end

      first_variable = variables.index(true)
      last_variable  = variables.rindex(true)

      if first_variable
        @start_column = first_variable
        # Convert to an index counted from the right: the last column is -1.
        @end_column = last_variable - variables.size
      else
        @start_column = 0
        @end_column = -1
      end
    end

    private

    # For each header position, true when the field at that position
    # (counted from the left) has the same length on every value line.
    def left_to_right_index
      Array.new(column_count) { |position| uniform_width?(position) }
    end

    # Same as left_to_right_index, but fields are counted from the right, so
    # extra commas in the middle of a line do not shift the trailing columns.
    # Entry k describes the field at index k - column_count (the last entry
    # describes index -1).
    def right_to_left_index
      Array.new(column_count) { |position| uniform_width?(position - column_count) }
    end

    def column_count
      @column_count ||= @header.to_s.split(',').size
    end

    # The value lines split on commas, computed once.
    def split_values
      @split_values ||= Array(@values).map { |value_line| value_line.split(',') }
    end

    # A field missing from a short line is treated as having length 0.
    def uniform_width?(index)
      split_values.map { |fields| fields[index].to_s.size }.uniq.size == 1
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module CommaSplice
  # Corrects a single CSV line against its header by asking CommaCalculator
  # how the values between left_bounds and right_bounds should be joined.
  class LineCorrector
    attr_reader :headers, :values, :header_line, :value_line, :right_bounds, :left_bounds

    # header_line, value_line - Line objects or raw strings.
    # left_bounds  - index (>= 0) of the first column that may hold a stray comma.
    # right_bounds - negative index of the last such column (-1 = last column).
    #
    # Raises RuntimeError when a bound is out of range.
    def initialize(header_line, value_line, left_bounds = 0, right_bounds = -1)
      header_line = Line.new(header_line) unless header_line.is_a?(Line)
      value_line  = Line.new(value_line) unless value_line.is_a?(Line)

      @header_line  = header_line
      @value_line   = value_line
      @headers      = header_line.values
      @values       = value_line.values
      @left_bounds  = left_bounds
      @right_bounds = right_bounds

      # The messages state exactly what the conditions check.
      raise 'right bounds must be negative (-1 or less)' unless right_bounds < 0
      raise 'left bounds must be zero or greater' unless left_bounds >= 0
    end

    # Truthy when the line has values and their count differs from the header's.
    def needs_correcting?
      @values && @values.size > 0 && @headers.size != @values.size
    end

    # The parsed values joined by commas; an empty string when the line
    # produced no values.
    def original
      Array(@values).join(',')
    end

    # The line with the values inside the bounds regrouped to fit the headers.
    def corrected
      # you want to provide this with the smallest set of possibilities
      # for performance reasons. Left and right bounds limit the values
      # where the comma error could be

      # For instance, with the following headers:
      # [playid,playtype,genre,timestamp,artist,title,albumtitle,label,prepost,programtype,iswebcast,isrequest]
      # the only values that could contain an extra comma are "artist,title,albumtitle,label"
      # therefore our left_bounds = 4, right_bounds = -5

      values_before = if left_bounds > 0
                        values[0..(left_bounds - 1)]
                      else
                        []
                      end

      # right_bounds == -1 means nothing follows; (right_bounds + 1)..-1 would
      # otherwise select the whole row.
      values_after = if right_bounds < -1
                       values[(right_bounds + 1)..-1]
                     else
                       []
                     end

      [values_before, corrector.correction, values_after].flatten.join(',')
    end

    private

    def corrector
      CommaCalculator.new(selected_headers, selected_values)
    end

    def selected_headers
      headers[left_bounds..right_bounds]
    end

    def selected_values
      values[left_bounds..right_bounds]
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: comma_splice
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jeff Keen
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-08-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: byebug
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: activesupport
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: thor
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: ''
|
98
|
+
email:
|
99
|
+
- jeff@keen.me
|
100
|
+
executables:
|
101
|
+
- comma_splice
|
102
|
+
extensions: []
|
103
|
+
extra_rdoc_files: []
|
104
|
+
files:
|
105
|
+
- ".gitignore"
|
106
|
+
- ".rspec"
|
107
|
+
- Gemfile
|
108
|
+
- Gemfile.lock
|
109
|
+
- LICENSE.txt
|
110
|
+
- README.md
|
111
|
+
- Rakefile
|
112
|
+
- bin/comma_splice
|
113
|
+
- bin/console
|
114
|
+
- bin/setup
|
115
|
+
- comma_splice.gemspec
|
116
|
+
- lib/comma_splice.rb
|
117
|
+
- lib/comma_splice/file_corrector.rb
|
118
|
+
- lib/comma_splice/helpers/comma_calculator.rb
|
119
|
+
- lib/comma_splice/helpers/content_finder.rb
|
120
|
+
- lib/comma_splice/helpers/join_possibilities.rb
|
121
|
+
- lib/comma_splice/helpers/line.rb
|
122
|
+
- lib/comma_splice/helpers/variable_column_finder.rb
|
123
|
+
- lib/comma_splice/line_corrector.rb
|
124
|
+
- lib/comma_splice/version.rb
|
125
|
+
homepage: http://github.com/jkeen/comma_splice
|
126
|
+
licenses:
|
127
|
+
- MIT
|
128
|
+
metadata:
|
129
|
+
allowed_push_host: https://rubygems.org
|
130
|
+
post_install_message:
|
131
|
+
rdoc_options: []
|
132
|
+
require_paths:
|
133
|
+
- lib
|
134
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
140
|
+
requirements:
|
141
|
+
- - ">="
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: '0'
|
144
|
+
requirements: []
|
145
|
+
rubyforge_project:
|
146
|
+
rubygems_version: 2.7.8
|
147
|
+
signing_key:
|
148
|
+
specification_version: 4
|
149
|
+
summary: Fixes CSVs with unescaped commas
|
150
|
+
test_files: []
|