comma_splice 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Gemfile +0 -1
- data/Gemfile.lock +1 -1
- data/bin/comma_splice +6 -1
- data/lib/comma_splice/file_corrector.rb +5 -1
- data/lib/comma_splice/helpers/comma_calculator.rb +42 -21
- data/lib/comma_splice/helpers/content_finder.rb +8 -6
- data/lib/comma_splice/helpers/option_scorer.rb +114 -0
- data/lib/comma_splice/helpers/variable_column_finder.rb +2 -2
- data/lib/comma_splice/line_corrector.rb +10 -2
- data/lib/comma_splice/version.rb +1 -1
- data/lib/comma_splice.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bd44bca5b615b5f267ae81b16aaa6b379b53963e637af1cc000d6229996963d4
|
4
|
+
data.tar.gz: bd70a25937f8baf3cec97eeb5276ce050f3898a76f5be9cf8e0c8e726bc5a8fa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 991f2138d4d08941b1a83d231338a5197bce6bcea0e136c3cbb67ca67cf04b31ad425ce148d48f2bf5ca8280bfee2b736d7ace002cd043dac071bdee8ca2ad2f
|
7
|
+
data.tar.gz: 4d9e601ab359fe6511d9c54b3469104e64f057953924a7867c6b7e58df224c9b5d9b279060ff7e67e3d965da7460334ed23f97c1fd2792c32441e2cc9f8161da
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
### 0.2 (January 27, 2020)
|
4
|
+
- [IMPROVEMENT] Add scoring model to better handle cases that needed prompting before, like comma-separated numbers
|
5
|
+
- [IMPROVEMENT] Correct line escaping even on lines that don't have incorrect commas to ensure correct parsing of generated CSV down the line
|
6
|
+
- [IMPROVEMENT] Use ruby csv library to generate lines instead of handling escaping cases manually
|
7
|
+
|
3
8
|
### 0.1.1 (January 24, 2020)
|
4
9
|
- [BUGFIX] handle case where all columns are equal widths
|
5
10
|
- [BUGFIX] Improve error message
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
data/bin/comma_splice
CHANGED
@@ -8,7 +8,12 @@ class CommaSpliceCLI < Thor
|
|
8
8
|
class_option :start_line, type: :numeric, default: nil
|
9
9
|
class_option :end_line, type: :numeric, default: nil
|
10
10
|
|
11
|
-
desc '
|
11
|
+
desc 'version', 'print the current comma_splice version'
|
12
|
+
def version
|
13
|
+
puts CommaSplice::VERSION
|
14
|
+
end
|
15
|
+
|
16
|
+
desc 'correct FILE_PATH', 'return corrected file contents'
|
12
17
|
def correct(file_path)
|
13
18
|
file_corrector = CommaSplice::FileCorrector.new(
|
14
19
|
file_path,
|
@@ -9,7 +9,7 @@ module CommaSplice
|
|
9
9
|
@content_finder = ContentFinder.new(@file_contents, start_line, end_line)
|
10
10
|
@csv_content = @content_finder.content
|
11
11
|
@start_line = @content_finder.start_line
|
12
|
-
@end_line = @content_finder.
|
12
|
+
@end_line = @content_finder.end_line
|
13
13
|
|
14
14
|
if start_column && end_column
|
15
15
|
@start_column = start_column
|
@@ -35,6 +35,10 @@ module CommaSplice
|
|
35
35
|
bad_lines.size.positive?
|
36
36
|
end
|
37
37
|
|
38
|
+
def needs_manual_input?
|
39
|
+
line_correctors.any?(&:needs_manual_input?)
|
40
|
+
end
|
41
|
+
|
38
42
|
def corrected
|
39
43
|
@corrected ||= [
|
40
44
|
@file_contents.lines[0, @start_line],
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module CommaSplice
|
2
3
|
# provide an array of CSV headers and and array of CSV values
|
3
4
|
# and this will figure out the best correction and prompt
|
@@ -5,17 +6,17 @@ module CommaSplice
|
|
5
6
|
|
6
7
|
class CommaCalculator
|
7
8
|
def initialize(headers, values)
|
9
|
+
raise StandardError, "Determining all the possibilities to fit #{values.size} values into the #{headers.size} headers #{headers.inspect} is computationally expensive. Please specify the columns where commas might be." if headers.size > 10 && values.size > 10
|
10
|
+
|
8
11
|
@headers = headers
|
9
12
|
@values = values
|
10
|
-
|
11
|
-
raise StandardError, "Determining all the possibilities to fit #{@values.size} values into the #{@headers.size} headers #{@headers.inspect} is computationally expensive. Please specify the columns where commas might be." if @headers.size > 10 && @values.size > 10
|
12
13
|
end
|
13
14
|
|
14
15
|
def correction
|
15
|
-
if @headers.size
|
16
|
+
if @headers.size == @values.size
|
16
17
|
@values
|
17
18
|
elsif best_options.size == 1
|
18
|
-
best_options.first
|
19
|
+
best_options.first.option
|
19
20
|
elsif best_options.size > 1
|
20
21
|
prompt_for_options(best_options)
|
21
22
|
else
|
@@ -24,12 +25,14 @@ module CommaSplice
|
|
24
25
|
end
|
25
26
|
|
26
27
|
def all_options
|
27
|
-
|
28
|
+
@all_options ||= join_possibilities.collect do |joins|
|
28
29
|
values = @values.dup
|
29
30
|
joins.collect do |join_num|
|
30
31
|
val = values.shift(join_num)
|
31
|
-
if val.
|
32
|
-
|
32
|
+
if val.empty?
|
33
|
+
nil
|
34
|
+
elsif val.size == 1
|
35
|
+
val.first
|
33
36
|
else
|
34
37
|
val.join(',')
|
35
38
|
end
|
@@ -37,14 +40,23 @@ module CommaSplice
|
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
40
|
-
def
|
41
|
-
all_options.
|
42
|
-
|
43
|
+
def ranked_options
|
44
|
+
@ranked_options ||= all_options.collect do |option|
|
45
|
+
OptionScorer.new(option)
|
43
46
|
end
|
44
47
|
end
|
45
48
|
|
46
|
-
def
|
47
|
-
|
49
|
+
def score_option(option)
|
50
|
+
OptionScorer.new(option).score
|
51
|
+
end
|
52
|
+
|
53
|
+
def best_options
|
54
|
+
max_score = ranked_options.collect { |o| o.score }.max
|
55
|
+
ranked_options.select { |o| o.score == max_score }
|
56
|
+
end
|
57
|
+
|
58
|
+
def needs_manual_input?
|
59
|
+
!best_options.one?
|
48
60
|
end
|
49
61
|
|
50
62
|
def needs_correcting?
|
@@ -53,10 +65,6 @@ module CommaSplice
|
|
53
65
|
|
54
66
|
private
|
55
67
|
|
56
|
-
def quoted_values(values)
|
57
|
-
"\"#{values.join(',').gsub(/(?<!")(?:"{2})*\K\"/, '""')}\"" # escape a double quote if it hasn't been escaped already
|
58
|
-
end
|
59
|
-
|
60
68
|
def join_possibilities
|
61
69
|
JoinPossibilities.new(@values.size, @headers.size).possibilities
|
62
70
|
end
|
@@ -66,21 +74,34 @@ module CommaSplice
|
|
66
74
|
|
67
75
|
options.each_with_index do |option, index|
|
68
76
|
@headers.each_with_index do |header, i|
|
69
|
-
marker = i.zero?
|
70
|
-
|
77
|
+
marker = if i.zero?
|
78
|
+
"(#{index + 1})"
|
79
|
+
else
|
80
|
+
''
|
81
|
+
end
|
82
|
+
|
83
|
+
puts marker.ljust(7) +
|
71
84
|
header.ljust(longest_header.size) + ': ' +
|
72
|
-
option[i]
|
85
|
+
option.option[i].to_s
|
73
86
|
end
|
87
|
+
puts ''.ljust(7) + "(score = #{option.score})"
|
88
|
+
puts ''.ljust(7) + option.breakdown
|
74
89
|
puts "\n"
|
75
90
|
end
|
76
91
|
|
92
|
+
puts "press 0 to see all options" if all_options.size != options.size
|
93
|
+
|
77
94
|
selected_option = nil
|
78
|
-
until selected_option && selected_option.to_i >
|
95
|
+
until selected_option && selected_option.to_i > -1
|
79
96
|
puts 'which one is correct?'
|
80
97
|
selected_option = STDIN.gets
|
81
98
|
end
|
82
99
|
|
83
|
-
|
100
|
+
if selected_option.to_i == 0
|
101
|
+
prompt_for_options(ranked_options.sort_by { |s| s.score.to_i }.reverse)
|
102
|
+
else
|
103
|
+
options[selected_option.to_i - 1].option
|
104
|
+
end
|
84
105
|
end
|
85
106
|
end
|
86
107
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module CommaSplice
|
2
4
|
# Given a file this will find the CSV content. Some files have some non-csv junk at the top
|
3
5
|
|
@@ -26,11 +28,11 @@ module CommaSplice
|
|
26
28
|
Line.new(line).values.size < 2
|
27
29
|
end
|
28
30
|
|
29
|
-
if relative_end_line
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
31
|
+
@end_line = if relative_end_line
|
32
|
+
@start_line + relative_end_line - 1
|
33
|
+
else
|
34
|
+
-1
|
35
|
+
end
|
34
36
|
|
35
37
|
@content = @file_contents.lines[@start_line..@end_line]
|
36
38
|
end
|
@@ -38,7 +40,7 @@ module CommaSplice
|
|
38
40
|
def parsed
|
39
41
|
quote_chars = %w[" | ~ ^ & *]
|
40
42
|
begin
|
41
|
-
CSV.parse(@content.join(
|
43
|
+
CSV.parse(@content.join("\n"), quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
|
42
44
|
rescue CSV::MalformedCSVError
|
43
45
|
quote_chars.empty? ? raise : retry
|
44
46
|
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
module CommaSplice
|
2
|
+
# scores options based on how likely they are to be correct
|
3
|
+
|
4
|
+
class OptionScorer
|
5
|
+
attr_reader :option
|
6
|
+
|
7
|
+
def initialize(option)
|
8
|
+
@option = option
|
9
|
+
@start_score = 100
|
10
|
+
end
|
11
|
+
|
12
|
+
def breakdown
|
13
|
+
score = @start_score
|
14
|
+
breakdown = []
|
15
|
+
|
16
|
+
rules.each do |rule|
|
17
|
+
rule_score = send(rule.to_sym)
|
18
|
+
score += rule_score
|
19
|
+
if rule_score != 0
|
20
|
+
breakdown << "#{rule_score.to_s.ljust(3)} #{rule.to_sym}"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
breakdown.join("\n")
|
25
|
+
end
|
26
|
+
|
27
|
+
def score
|
28
|
+
score = @start_score
|
29
|
+
rules.each do |rule|
|
30
|
+
score += send(rule.to_sym)
|
31
|
+
end
|
32
|
+
score
|
33
|
+
end
|
34
|
+
|
35
|
+
def options_that_start_with_a_space
|
36
|
+
option.select do |o|
|
37
|
+
o.to_s.starts_with?(' ')
|
38
|
+
end.size * -10
|
39
|
+
end
|
40
|
+
|
41
|
+
def options_that_start_with_a_quote_followed_by_a_space
|
42
|
+
option.select do |o|
|
43
|
+
o.to_s.starts_with?('" ')
|
44
|
+
end.size * -1
|
45
|
+
end
|
46
|
+
|
47
|
+
def options_that_start_with_a_comma
|
48
|
+
option.select do |o|
|
49
|
+
o.to_s.starts_with?(',')
|
50
|
+
end.size * -5
|
51
|
+
end
|
52
|
+
|
53
|
+
def options_that_end_with_a_comma
|
54
|
+
option.select do |o|
|
55
|
+
o.to_s.ends_with?(',')
|
56
|
+
end.size * -5
|
57
|
+
end
|
58
|
+
|
59
|
+
def options_that_have_words_joined_by_commas
|
60
|
+
option.select do |o|
|
61
|
+
o.to_s.match(/[A-Za-z],[A-Za-z]/)
|
62
|
+
end.compact.size * -5
|
63
|
+
end
|
64
|
+
|
65
|
+
def options_that_are_blank
|
66
|
+
option.select do |o|
|
67
|
+
o.to_s.strip.blank?
|
68
|
+
end.size * -5
|
69
|
+
end
|
70
|
+
|
71
|
+
def options_that_have_longest_comma_separated_number
|
72
|
+
# favor items that have a longer comma separated number
|
73
|
+
# i.e in the following example, option 1 should win
|
74
|
+
# (1) artist : Half Japanese
|
75
|
+
# title : 1,000,000,000 Kisses
|
76
|
+
# albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
|
77
|
+
# label : Stillwater/Fire
|
78
|
+
#
|
79
|
+
#
|
80
|
+
# (2) artist : Half Japanese,1,000,000
|
81
|
+
# title : 000 Kisses
|
82
|
+
# albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
|
83
|
+
# label : Stillwater/Fire
|
84
|
+
#
|
85
|
+
#
|
86
|
+
# (3) artist : Half Japanese,1
|
87
|
+
# title : 000,000,000 Kisses
|
88
|
+
# albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
|
89
|
+
# label : Stillwater/Fire
|
90
|
+
#
|
91
|
+
#
|
92
|
+
# (4) artist : Half Japanese,1,000
|
93
|
+
# title : 000,000 Kisses
|
94
|
+
# albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
|
95
|
+
# label : Stillwater/Fire
|
96
|
+
|
97
|
+
option.collect do |o|
|
98
|
+
result = o.to_s.scan(/\d{1,3}(?:,\d{1,3})*(?:\.\d+)?/)
|
99
|
+
if result.first && result.first.index(',')
|
100
|
+
result.first.size
|
101
|
+
else
|
102
|
+
0
|
103
|
+
end
|
104
|
+
end.max.to_i
|
105
|
+
end
|
106
|
+
|
107
|
+
private
|
108
|
+
|
109
|
+
def rules
|
110
|
+
methods.grep(/options_that/)
|
111
|
+
end
|
112
|
+
|
113
|
+
end
|
114
|
+
end
|
@@ -46,7 +46,7 @@ module CommaSplice
|
|
46
46
|
left_to_right_index = []
|
47
47
|
@header.split(',').size.times do |time|
|
48
48
|
left_to_right_index.push(@values.map do |value_line|
|
49
|
-
value_line.split(',')[time].size
|
49
|
+
value_line.split(',')[time].to_s.size
|
50
50
|
end.uniq.size == 1)
|
51
51
|
end
|
52
52
|
|
@@ -57,7 +57,7 @@ module CommaSplice
|
|
57
57
|
right_to_left_index = []
|
58
58
|
@header.split(',').size.times do |time|
|
59
59
|
right_to_left_index.unshift(@values.map do |value_line|
|
60
|
-
value_line.split(',')[-time].size
|
60
|
+
value_line.split(',')[-time].to_s.size
|
61
61
|
end.uniq.size == 1)
|
62
62
|
end
|
63
63
|
|
@@ -21,8 +21,12 @@ module CommaSplice
|
|
21
21
|
@values && @values.size > 0 && @headers.size != @values.size
|
22
22
|
end
|
23
23
|
|
24
|
+
def needs_manual_input?
|
25
|
+
corrector.needs_manual_input?
|
26
|
+
end
|
27
|
+
|
24
28
|
def original
|
25
|
-
@values
|
29
|
+
generate_csv_line(@values)
|
26
30
|
end
|
27
31
|
|
28
32
|
def corrected
|
@@ -37,11 +41,15 @@ module CommaSplice
|
|
37
41
|
|
38
42
|
values_before = values[0...left_bounds]
|
39
43
|
values_after = values.slice(right_bounds + 1, -(right_bounds + 1))
|
40
|
-
[values_before, corrector.correction, values_after].flatten
|
44
|
+
generate_csv_line([values_before, corrector.correction, values_after].flatten)
|
41
45
|
end
|
42
46
|
|
43
47
|
private
|
44
48
|
|
49
|
+
def generate_csv_line(values)
|
50
|
+
CSV.generate_line(values)
|
51
|
+
end
|
52
|
+
|
45
53
|
def corrector
|
46
54
|
CommaCalculator.new(selected_headers, selected_values)
|
47
55
|
end
|
data/lib/comma_splice/version.rb
CHANGED
data/lib/comma_splice.rb
CHANGED
@@ -8,6 +8,7 @@ require 'comma_splice/helpers/variable_column_finder'
|
|
8
8
|
require 'comma_splice/helpers/line'
|
9
9
|
require 'comma_splice/helpers/join_possibilities'
|
10
10
|
require 'comma_splice/helpers/comma_calculator'
|
11
|
+
require 'comma_splice/helpers/option_scorer'
|
11
12
|
|
12
13
|
require 'comma_splice/line_corrector'
|
13
14
|
require 'comma_splice/file_corrector'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: comma_splice
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeff Keen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-01-
|
11
|
+
date: 2020-01-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -120,6 +120,7 @@ files:
|
|
120
120
|
- lib/comma_splice/helpers/content_finder.rb
|
121
121
|
- lib/comma_splice/helpers/join_possibilities.rb
|
122
122
|
- lib/comma_splice/helpers/line.rb
|
123
|
+
- lib/comma_splice/helpers/option_scorer.rb
|
123
124
|
- lib/comma_splice/helpers/variable_column_finder.rb
|
124
125
|
- lib/comma_splice/line_corrector.rb
|
125
126
|
- lib/comma_splice/version.rb
|