comma_splice 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Gemfile +0 -1
- data/Gemfile.lock +1 -1
- data/bin/comma_splice +6 -1
- data/lib/comma_splice/file_corrector.rb +5 -1
- data/lib/comma_splice/helpers/comma_calculator.rb +42 -21
- data/lib/comma_splice/helpers/content_finder.rb +8 -6
- data/lib/comma_splice/helpers/option_scorer.rb +114 -0
- data/lib/comma_splice/helpers/variable_column_finder.rb +2 -2
- data/lib/comma_splice/line_corrector.rb +10 -2
- data/lib/comma_splice/version.rb +1 -1
- data/lib/comma_splice.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bd44bca5b615b5f267ae81b16aaa6b379b53963e637af1cc000d6229996963d4
|
4
|
+
data.tar.gz: bd70a25937f8baf3cec97eeb5276ce050f3898a76f5be9cf8e0c8e726bc5a8fa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 991f2138d4d08941b1a83d231338a5197bce6bcea0e136c3cbb67ca67cf04b31ad425ce148d48f2bf5ca8280bfee2b736d7ace002cd043dac071bdee8ca2ad2f
|
7
|
+
data.tar.gz: 4d9e601ab359fe6511d9c54b3469104e64f057953924a7867c6b7e58df224c9b5d9b279060ff7e67e3d965da7460334ed23f97c1fd2792c32441e2cc9f8161da
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
### 0.2 (January 27, 2020)
|
4
|
+
- [IMPROVEMENT] Add scoring model to better handle cases that needed prompting before, like comma-separated numbers
|
5
|
+
- [IMPROVEMENT] Correct line escaping even on lines that don't have incorrect commas to ensure correct parsing of generated CSV down the line
|
6
|
+
- [IMPROVEMENT] Use ruby csv library to generate lines instead of handling escaping cases manually
|
7
|
+
|
3
8
|
### 0.1.1 (January 24, 2020)
|
4
9
|
- [BUGFIX] handle case where all columns are equal widths
|
5
10
|
- [BUGFIX] Improve error message
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
data/bin/comma_splice
CHANGED
@@ -8,7 +8,12 @@ class CommaSpliceCLI < Thor
|
|
8
8
|
class_option :start_line, type: :numeric, default: nil
|
9
9
|
class_option :end_line, type: :numeric, default: nil
|
10
10
|
|
11
|
-
desc '
|
11
|
+
desc 'version', 'print the current comma_splice version'
|
12
|
+
def version
|
13
|
+
puts CommaSplice::VERSION
|
14
|
+
end
|
15
|
+
|
16
|
+
desc 'correct FILE_PATH', 'return corrected file contents'
|
12
17
|
def correct(file_path)
|
13
18
|
file_corrector = CommaSplice::FileCorrector.new(
|
14
19
|
file_path,
|
@@ -9,7 +9,7 @@ module CommaSplice
|
|
9
9
|
@content_finder = ContentFinder.new(@file_contents, start_line, end_line)
|
10
10
|
@csv_content = @content_finder.content
|
11
11
|
@start_line = @content_finder.start_line
|
12
|
-
@end_line = @content_finder.
|
12
|
+
@end_line = @content_finder.end_line
|
13
13
|
|
14
14
|
if start_column && end_column
|
15
15
|
@start_column = start_column
|
@@ -35,6 +35,10 @@ module CommaSplice
|
|
35
35
|
bad_lines.size.positive?
|
36
36
|
end
|
37
37
|
|
38
|
+
def needs_manual_input?
|
39
|
+
line_correctors.any?(&:needs_manual_input?)
|
40
|
+
end
|
41
|
+
|
38
42
|
def corrected
|
39
43
|
@corrected ||= [
|
40
44
|
@file_contents.lines[0, @start_line],
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module CommaSplice
|
2
3
|
# provide an array of CSV headers and an array of CSV values
|
3
4
|
# and this will figure out the best correction and prompt
|
@@ -5,17 +6,17 @@ module CommaSplice
|
|
5
6
|
|
6
7
|
class CommaCalculator
|
7
8
|
def initialize(headers, values)
|
9
|
+
raise StandardError, "Determining all the possibilities to fit #{values.size} values into the #{headers.size} headers #{headers.inspect} is computationally expensive. Please specify the columns where commas might be." if headers.size > 10 && values.size > 10
|
10
|
+
|
8
11
|
@headers = headers
|
9
12
|
@values = values
|
10
|
-
|
11
|
-
raise StandardError, "Determining all the possibilities to fit #{@values.size} values into the #{@headers.size} headers #{@headers.inspect} is computationally expensive. Please specify the columns where commas might be." if @headers.size > 10 && @values.size > 10
|
12
13
|
end
|
13
14
|
|
14
15
|
def correction
|
15
|
-
if @headers.size
|
16
|
+
if @headers.size == @values.size
|
16
17
|
@values
|
17
18
|
elsif best_options.size == 1
|
18
|
-
best_options.first
|
19
|
+
best_options.first.option
|
19
20
|
elsif best_options.size > 1
|
20
21
|
prompt_for_options(best_options)
|
21
22
|
else
|
@@ -24,12 +25,14 @@ module CommaSplice
|
|
24
25
|
end
|
25
26
|
|
26
27
|
def all_options
|
27
|
-
|
28
|
+
@all_options ||= join_possibilities.collect do |joins|
|
28
29
|
values = @values.dup
|
29
30
|
joins.collect do |join_num|
|
30
31
|
val = values.shift(join_num)
|
31
|
-
if val.
|
32
|
-
|
32
|
+
if val.empty?
|
33
|
+
nil
|
34
|
+
elsif val.size == 1
|
35
|
+
val.first
|
33
36
|
else
|
34
37
|
val.join(',')
|
35
38
|
end
|
@@ -37,14 +40,23 @@ module CommaSplice
|
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
40
|
-
def
|
41
|
-
all_options.
|
42
|
-
|
43
|
+
def ranked_options
|
44
|
+
@ranked_options ||= all_options.collect do |option|
|
45
|
+
OptionScorer.new(option)
|
43
46
|
end
|
44
47
|
end
|
45
48
|
|
46
|
-
def
|
47
|
-
|
49
|
+
def score_option(option)
|
50
|
+
OptionScorer.new(option).score
|
51
|
+
end
|
52
|
+
|
53
|
+
def best_options
|
54
|
+
max_score = ranked_options.collect { |o| o.score }.max
|
55
|
+
ranked_options.select { |o| o.score == max_score }
|
56
|
+
end
|
57
|
+
|
58
|
+
def needs_manual_input?
|
59
|
+
!best_options.one?
|
48
60
|
end
|
49
61
|
|
50
62
|
def needs_correcting?
|
@@ -53,10 +65,6 @@ module CommaSplice
|
|
53
65
|
|
54
66
|
private
|
55
67
|
|
56
|
-
def quoted_values(values)
|
57
|
-
"\"#{values.join(',').gsub(/(?<!")(?:"{2})*\K\"/, '""')}\"" # escape a double quote if it hasn't been escaped already
|
58
|
-
end
|
59
|
-
|
60
68
|
def join_possibilities
|
61
69
|
JoinPossibilities.new(@values.size, @headers.size).possibilities
|
62
70
|
end
|
@@ -66,21 +74,34 @@ module CommaSplice
|
|
66
74
|
|
67
75
|
options.each_with_index do |option, index|
|
68
76
|
@headers.each_with_index do |header, i|
|
69
|
-
marker = i.zero?
|
70
|
-
|
77
|
+
marker = if i.zero?
|
78
|
+
"(#{index + 1})"
|
79
|
+
else
|
80
|
+
''
|
81
|
+
end
|
82
|
+
|
83
|
+
puts marker.ljust(7) +
|
71
84
|
header.ljust(longest_header.size) + ': ' +
|
72
|
-
option[i]
|
85
|
+
option.option[i].to_s
|
73
86
|
end
|
87
|
+
puts ''.ljust(7) + "(score = #{option.score})"
|
88
|
+
puts ''.ljust(7) + option.breakdown
|
74
89
|
puts "\n"
|
75
90
|
end
|
76
91
|
|
92
|
+
puts "press 0 to see all options" if all_options.size != options.size
|
93
|
+
|
77
94
|
selected_option = nil
|
78
|
-
until selected_option && selected_option.to_i >
|
95
|
+
until selected_option && selected_option.to_i > -1
|
79
96
|
puts 'which one is correct?'
|
80
97
|
selected_option = STDIN.gets
|
81
98
|
end
|
82
99
|
|
83
|
-
|
100
|
+
if selected_option.to_i == 0
|
101
|
+
prompt_for_options(ranked_options.sort_by { |s| s.score.to_i }.reverse)
|
102
|
+
else
|
103
|
+
options[selected_option.to_i - 1].option
|
104
|
+
end
|
84
105
|
end
|
85
106
|
end
|
86
107
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module CommaSplice
|
2
4
|
# Given a file this will find the CSV content. Some files have some non-csv junk at the top
|
3
5
|
|
@@ -26,11 +28,11 @@ module CommaSplice
|
|
26
28
|
Line.new(line).values.size < 2
|
27
29
|
end
|
28
30
|
|
29
|
-
if relative_end_line
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
31
|
+
@end_line = if relative_end_line
|
32
|
+
@start_line + relative_end_line - 1
|
33
|
+
else
|
34
|
+
-1
|
35
|
+
end
|
34
36
|
|
35
37
|
@content = @file_contents.lines[@start_line..@end_line]
|
36
38
|
end
|
@@ -38,7 +40,7 @@ module CommaSplice
|
|
38
40
|
def parsed
|
39
41
|
quote_chars = %w[" | ~ ^ & *]
|
40
42
|
begin
|
41
|
-
CSV.parse(@content.join(
|
43
|
+
CSV.parse(@content.join("\n"), quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
|
42
44
|
rescue CSV::MalformedCSVError
|
43
45
|
quote_chars.empty? ? raise : retry
|
44
46
|
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
module CommaSplice
|
2
|
+
# scores options based on how likely they are to be correct
|
3
|
+
|
4
|
+
class OptionScorer
|
5
|
+
attr_reader :option
|
6
|
+
|
7
|
+
def initialize(option)
|
8
|
+
@option = option
|
9
|
+
@start_score = 100
|
10
|
+
end
|
11
|
+
|
12
|
+
def breakdown
|
13
|
+
score = @start_score
|
14
|
+
breakdown = []
|
15
|
+
|
16
|
+
rules.each do |rule|
|
17
|
+
rule_score = send(rule.to_sym)
|
18
|
+
score += rule_score
|
19
|
+
if rule_score != 0
|
20
|
+
breakdown << "#{rule_score.to_s.ljust(3)} #{rule.to_sym}"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
breakdown.join("\n")
|
25
|
+
end
|
26
|
+
|
27
|
+
def score
|
28
|
+
score = @start_score
|
29
|
+
rules.each do |rule|
|
30
|
+
score += send(rule.to_sym)
|
31
|
+
end
|
32
|
+
score
|
33
|
+
end
|
34
|
+
|
35
|
+
def options_that_start_with_a_space
|
36
|
+
option.select do |o|
|
37
|
+
o.to_s.starts_with?(' ')
|
38
|
+
end.size * -10
|
39
|
+
end
|
40
|
+
|
41
|
+
def options_that_start_with_a_quote_followed_by_a_space
|
42
|
+
option.select do |o|
|
43
|
+
o.to_s.starts_with?('" ')
|
44
|
+
end.size * -1
|
45
|
+
end
|
46
|
+
|
47
|
+
def options_that_start_with_a_comma
|
48
|
+
option.select do |o|
|
49
|
+
o.to_s.starts_with?(',')
|
50
|
+
end.size * -5
|
51
|
+
end
|
52
|
+
|
53
|
+
def options_that_end_with_a_comma
|
54
|
+
option.select do |o|
|
55
|
+
o.to_s.ends_with?(',')
|
56
|
+
end.size * -5
|
57
|
+
end
|
58
|
+
|
59
|
+
def options_that_have_words_joined_by_commas
|
60
|
+
option.select do |o|
|
61
|
+
o.to_s.match(/[A-Za-z],[A-Za-z]/)
|
62
|
+
end.compact.size * -5
|
63
|
+
end
|
64
|
+
|
65
|
+
def options_that_are_blank
|
66
|
+
option.select do |o|
|
67
|
+
o.to_s.strip.blank?
|
68
|
+
end.size * -5
|
69
|
+
end
|
70
|
+
|
71
|
+
def options_that_have_longest_comma_separated_number
|
72
|
+
# favor items that have a longer comma separated number
|
73
|
+
# e.g. in the following example, option 1 should win
|
74
|
+
# (1) artist : Half Japanese
|
75
|
+
# title : 1,000,000,000 Kisses
|
76
|
+
# albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
|
77
|
+
# label : Stillwater/Fire
|
78
|
+
#
|
79
|
+
#
|
80
|
+
# (2) artist : Half Japanese,1,000,000
|
81
|
+
# title : 000 Kisses
|
82
|
+
# albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
|
83
|
+
# label : Stillwater/Fire
|
84
|
+
#
|
85
|
+
#
|
86
|
+
# (3) artist : Half Japanese,1
|
87
|
+
# title : 000,000,000 Kisses
|
88
|
+
# albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
|
89
|
+
# label : Stillwater/Fire
|
90
|
+
#
|
91
|
+
#
|
92
|
+
# (4) artist : Half Japanese,1,000
|
93
|
+
# title : 000,000 Kisses
|
94
|
+
# albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
|
95
|
+
# label : Stillwater/Fire
|
96
|
+
|
97
|
+
option.collect do |o|
|
98
|
+
result = o.to_s.scan(/\d{1,3}(?:,\d{1,3})*(?:\.\d+)?/)
|
99
|
+
if result.first && result.first.index(',')
|
100
|
+
result.first.size
|
101
|
+
else
|
102
|
+
0
|
103
|
+
end
|
104
|
+
end.max.to_i
|
105
|
+
end
|
106
|
+
|
107
|
+
private
|
108
|
+
|
109
|
+
def rules
|
110
|
+
methods.grep(/options_that/)
|
111
|
+
end
|
112
|
+
|
113
|
+
end
|
114
|
+
end
|
@@ -46,7 +46,7 @@ module CommaSplice
|
|
46
46
|
left_to_right_index = []
|
47
47
|
@header.split(',').size.times do |time|
|
48
48
|
left_to_right_index.push(@values.map do |value_line|
|
49
|
-
value_line.split(',')[time].size
|
49
|
+
value_line.split(',')[time].to_s.size
|
50
50
|
end.uniq.size == 1)
|
51
51
|
end
|
52
52
|
|
@@ -57,7 +57,7 @@ module CommaSplice
|
|
57
57
|
right_to_left_index = []
|
58
58
|
@header.split(',').size.times do |time|
|
59
59
|
right_to_left_index.unshift(@values.map do |value_line|
|
60
|
-
value_line.split(',')[-time].size
|
60
|
+
value_line.split(',')[-time].to_s.size
|
61
61
|
end.uniq.size == 1)
|
62
62
|
end
|
63
63
|
|
@@ -21,8 +21,12 @@ module CommaSplice
|
|
21
21
|
@values && @values.size > 0 && @headers.size != @values.size
|
22
22
|
end
|
23
23
|
|
24
|
+
def needs_manual_input?
|
25
|
+
corrector.needs_manual_input?
|
26
|
+
end
|
27
|
+
|
24
28
|
def original
|
25
|
-
@values
|
29
|
+
generate_csv_line(@values)
|
26
30
|
end
|
27
31
|
|
28
32
|
def corrected
|
@@ -37,11 +41,15 @@ module CommaSplice
|
|
37
41
|
|
38
42
|
values_before = values[0...left_bounds]
|
39
43
|
values_after = values.slice(right_bounds + 1, -(right_bounds + 1))
|
40
|
-
[values_before, corrector.correction, values_after].flatten
|
44
|
+
generate_csv_line([values_before, corrector.correction, values_after].flatten)
|
41
45
|
end
|
42
46
|
|
43
47
|
private
|
44
48
|
|
49
|
+
def generate_csv_line(values)
|
50
|
+
CSV.generate_line(values)
|
51
|
+
end
|
52
|
+
|
45
53
|
def corrector
|
46
54
|
CommaCalculator.new(selected_headers, selected_values)
|
47
55
|
end
|
data/lib/comma_splice/version.rb
CHANGED
data/lib/comma_splice.rb
CHANGED
@@ -8,6 +8,7 @@ require 'comma_splice/helpers/variable_column_finder'
|
|
8
8
|
require 'comma_splice/helpers/line'
|
9
9
|
require 'comma_splice/helpers/join_possibilities'
|
10
10
|
require 'comma_splice/helpers/comma_calculator'
|
11
|
+
require 'comma_splice/helpers/option_scorer'
|
11
12
|
|
12
13
|
require 'comma_splice/line_corrector'
|
13
14
|
require 'comma_splice/file_corrector'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: comma_splice
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeff Keen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-01-
|
11
|
+
date: 2020-01-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -120,6 +120,7 @@ files:
|
|
120
120
|
- lib/comma_splice/helpers/content_finder.rb
|
121
121
|
- lib/comma_splice/helpers/join_possibilities.rb
|
122
122
|
- lib/comma_splice/helpers/line.rb
|
123
|
+
- lib/comma_splice/helpers/option_scorer.rb
|
123
124
|
- lib/comma_splice/helpers/variable_column_finder.rb
|
124
125
|
- lib/comma_splice/line_corrector.rb
|
125
126
|
- lib/comma_splice/version.rb
|