comma_splice 0.1.1 → 0.2.0

This diff shows the content changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74958cacc824b9e475df20acee4cc75ac354738ab4ed475a7ddc5f3ce5c7c553
4
- data.tar.gz: 3b7129b58e24be7e9549e2971b994fd159e0d21675118e3c413a38dc1823db9c
3
+ metadata.gz: bd44bca5b615b5f267ae81b16aaa6b379b53963e637af1cc000d6229996963d4
4
+ data.tar.gz: bd70a25937f8baf3cec97eeb5276ce050f3898a76f5be9cf8e0c8e726bc5a8fa
5
5
  SHA512:
6
- metadata.gz: 112fea4670275a7222bbf60d13c8f61cea9386a60eaaf2cf16fd514f5c0ffd69a4e2e0453658408977607608340ca1e94e7f53581565d10014501c18d61a6570
7
- data.tar.gz: b62a9835b22555efd4aad03c467c1d24f298d50c34b399171f8fc5bcb0d8c1ddfff1be14268599fe5ce67b88c6b9201c6c3e250e72de34361ba252594a6146e1
6
+ metadata.gz: 991f2138d4d08941b1a83d231338a5197bce6bcea0e136c3cbb67ca67cf04b31ad425ce148d48f2bf5ca8280bfee2b736d7ace002cd043dac071bdee8ca2ad2f
7
+ data.tar.gz: 4d9e601ab359fe6511d9c54b3469104e64f057953924a7867c6b7e58df224c9b5d9b279060ff7e67e3d965da7460334ed23f97c1fd2792c32441e2cc9f8161da
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # Changelog
2
2
 
3
+ ### 0.2 (January 27, 2020)
4
+ - [IMPROVEMENT] Add scoring model to better handle cases that needed prompting before, like comma-separated numbers
5
+ - [IMPROVEMENT] Correct line escaping even on lines that don't have incorrect commas to ensure correct parsing of generated CSV down the line
6
+ - [IMPROVEMENT] Use ruby csv library to generate lines instead of handling escaping cases manually
7
+
3
8
  ### 0.1.1 (January 24, 2020)
4
9
  - [BUGFIX] handle case where all columns are equal widths
5
10
  - [BUGFIX] Improve error message
data/Gemfile CHANGED
@@ -1,4 +1,3 @@
1
1
  source "https://rubygems.org"
2
-
3
2
  # Specify your gem's dependencies in comma_splice.gemspec
4
3
  gemspec
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- comma_splice (0.1.1)
4
+ comma_splice (0.2.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/bin/comma_splice CHANGED
@@ -8,7 +8,12 @@ class CommaSpliceCLI < Thor
8
8
  class_option :start_line, type: :numeric, default: nil
9
9
  class_option :end_line, type: :numeric, default: nil
10
10
 
11
- desc 'fix FILE_PATH [SAVE_PATH]', 'return corrected file contents'
11
+ desc 'version', 'print the current comma_splice version'
12
+ def version
13
+ puts CommaSplice::VERSION
14
+ end
15
+
16
+ desc 'correct FILE_PATH', 'return corrected file contents'
12
17
  def correct(file_path)
13
18
  file_corrector = CommaSplice::FileCorrector.new(
14
19
  file_path,
@@ -9,7 +9,7 @@ module CommaSplice
9
9
  @content_finder = ContentFinder.new(@file_contents, start_line, end_line)
10
10
  @csv_content = @content_finder.content
11
11
  @start_line = @content_finder.start_line
12
- @end_line = @content_finder.start_line
12
+ @end_line = @content_finder.end_line
13
13
 
14
14
  if start_column && end_column
15
15
  @start_column = start_column
@@ -35,6 +35,10 @@ module CommaSplice
35
35
  bad_lines.size.positive?
36
36
  end
37
37
 
38
+ def needs_manual_input?
39
+ line_correctors.any?(&:needs_manual_input?)
40
+ end
41
+
38
42
  def corrected
39
43
  @corrected ||= [
40
44
  @file_contents.lines[0, @start_line],
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  module CommaSplice
2
3
  # provide an array of CSV headers and an array of CSV values
3
4
  # and this will figure out the best correction and prompt
@@ -5,17 +6,17 @@ module CommaSplice
5
6
 
6
7
  class CommaCalculator
7
8
  def initialize(headers, values)
9
+ raise StandardError, "Determining all the possibilities to fit #{values.size} values into the #{headers.size} headers #{headers.inspect} is computationally expensive. Please specify the columns where commas might be." if headers.size > 10 && values.size > 10
10
+
8
11
  @headers = headers
9
12
  @values = values
10
-
11
- raise StandardError, "Determining all the possibilities to fit #{@values.size} values into the #{@headers.size} headers #{@headers.inspect} is computationally expensive. Please specify the columns where commas might be." if @headers.size > 10 && @values.size > 10
12
13
  end
13
14
 
14
15
  def correction
15
- if @headers.size === @values.size
16
+ if @headers.size == @values.size
16
17
  @values
17
18
  elsif best_options.size == 1
18
- best_options.first
19
+ best_options.first.option
19
20
  elsif best_options.size > 1
20
21
  prompt_for_options(best_options)
21
22
  else
@@ -24,12 +25,14 @@ module CommaSplice
24
25
  end
25
26
 
26
27
  def all_options
27
- options = join_possibilities.collect do |joins|
28
+ @all_options ||= join_possibilities.collect do |joins|
28
29
  values = @values.dup
29
30
  joins.collect do |join_num|
30
31
  val = values.shift(join_num)
31
- if val.size > 1
32
- quoted_values(val)
32
+ if val.empty?
33
+ nil
34
+ elsif val.size == 1
35
+ val.first
33
36
  else
34
37
  val.join(',')
35
38
  end
@@ -37,14 +40,23 @@ module CommaSplice
37
40
  end
38
41
  end
39
42
 
40
- def best_options
41
- all_options.select do |option|
42
- option.none? { |o| o.starts_with?(' ') || o.starts_with?('" ') }
43
+ def ranked_options
44
+ @ranked_options ||= all_options.collect do |option|
45
+ OptionScorer.new(option)
43
46
  end
44
47
  end
45
48
 
46
- def requires_manual_input?
47
- needs_correcting? && best_options.many?
49
+ def score_option(option)
50
+ OptionScorer.new(option).score
51
+ end
52
+
53
+ def best_options
54
+ max_score = ranked_options.collect { |o| o.score }.max
55
+ ranked_options.select { |o| o.score == max_score }
56
+ end
57
+
58
+ def needs_manual_input?
59
+ !best_options.one?
48
60
  end
49
61
 
50
62
  def needs_correcting?
@@ -53,10 +65,6 @@ module CommaSplice
53
65
 
54
66
  private
55
67
 
56
- def quoted_values(values)
57
- "\"#{values.join(',').gsub(/(?<!")(?:"{2})*\K\"/, '""')}\"" # escape a double quote if it hasn't been escaped already
58
- end
59
-
60
68
  def join_possibilities
61
69
  JoinPossibilities.new(@values.size, @headers.size).possibilities
62
70
  end
@@ -66,21 +74,34 @@ module CommaSplice
66
74
 
67
75
  options.each_with_index do |option, index|
68
76
  @headers.each_with_index do |header, i|
69
- marker = i.zero? ? "(#{index + 1})" : ''
70
- puts marker.ljust(5) +
77
+ marker = if i.zero?
78
+ "(#{index + 1})"
79
+ else
80
+ ''
81
+ end
82
+
83
+ puts marker.ljust(7) +
71
84
  header.ljust(longest_header.size) + ': ' +
72
- option[i]
85
+ option.option[i].to_s
73
86
  end
87
+ puts ''.ljust(7) + "(score = #{option.score})"
88
+ puts ''.ljust(7) + option.breakdown
74
89
  puts "\n"
75
90
  end
76
91
 
92
+ puts "press 0 to see all options" if all_options.size != options.size
93
+
77
94
  selected_option = nil
78
- until selected_option && selected_option.to_i > 0
95
+ until selected_option && selected_option.to_i > -1
79
96
  puts 'which one is correct?'
80
97
  selected_option = STDIN.gets
81
98
  end
82
99
 
83
- options[selected_option.to_i - 1]
100
+ if selected_option.to_i == 0
101
+ prompt_for_options(ranked_options.sort_by { |s| s.score.to_i }.reverse)
102
+ else
103
+ options[selected_option.to_i - 1].option
104
+ end
84
105
  end
85
106
  end
86
107
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module CommaSplice
2
4
  # Given a file this will find the CSV content. Some files have some non-csv junk at the top
3
5
 
@@ -26,11 +28,11 @@ module CommaSplice
26
28
  Line.new(line).values.size < 2
27
29
  end
28
30
 
29
- if relative_end_line
30
- @end_line = @start_line + relative_end_line - 1
31
- else
32
- @end_line = -1
33
- end
31
+ @end_line = if relative_end_line
32
+ @start_line + relative_end_line - 1
33
+ else
34
+ -1
35
+ end
34
36
 
35
37
  @content = @file_contents.lines[@start_line..@end_line]
36
38
  end
@@ -38,7 +40,7 @@ module CommaSplice
38
40
  def parsed
39
41
  quote_chars = %w[" | ~ ^ & *]
40
42
  begin
41
- CSV.parse(@content.join('\n'), quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
43
+ CSV.parse(@content.join("\n"), quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
42
44
  rescue CSV::MalformedCSVError
43
45
  quote_chars.empty? ? raise : retry
44
46
  end
@@ -0,0 +1,114 @@
1
+ module CommaSplice
2
+ # scores options based on how likely they are to be correct
3
+
4
+ class OptionScorer
5
+ attr_reader :option
6
+
7
+ def initialize(option)
8
+ @option = option
9
+ @start_score = 100
10
+ end
11
+
12
+ def breakdown
13
+ score = @start_score
14
+ breakdown = []
15
+
16
+ rules.each do |rule|
17
+ rule_score = send(rule.to_sym)
18
+ score += rule_score
19
+ if rule_score != 0
20
+ breakdown << "#{rule_score.to_s.ljust(3)} #{rule.to_sym}"
21
+ end
22
+ end
23
+
24
+ breakdown.join("\n")
25
+ end
26
+
27
+ def score
28
+ score = @start_score
29
+ rules.each do |rule|
30
+ score += send(rule.to_sym)
31
+ end
32
+ score
33
+ end
34
+
35
+ def options_that_start_with_a_space
36
+ option.select do |o|
37
+ o.to_s.starts_with?(' ')
38
+ end.size * -10
39
+ end
40
+
41
+ def options_that_start_with_a_quote_followed_by_a_space
42
+ option.select do |o|
43
+ o.to_s.starts_with?('" ')
44
+ end.size * -1
45
+ end
46
+
47
+ def options_that_start_with_a_comma
48
+ option.select do |o|
49
+ o.to_s.starts_with?(',')
50
+ end.size * -5
51
+ end
52
+
53
+ def options_that_end_with_a_comma
54
+ option.select do |o|
55
+ o.to_s.ends_with?(',')
56
+ end.size * -5
57
+ end
58
+
59
+ def options_that_have_words_joined_by_commas
60
+ option.select do |o|
61
+ o.to_s.match(/[A-Za-z],[A-Za-z]/)
62
+ end.compact.size * -5
63
+ end
64
+
65
+ def options_that_are_blank
66
+ option.select do |o|
67
+ o.to_s.strip.blank?
68
+ end.size * -5
69
+ end
70
+
71
+ def options_that_have_longest_comma_separated_number
72
+ # favor items that have a longer comma separated number
73
+ # i.e in the following example, option 1 should win
74
+ # (1) artist : Half Japanese
75
+ # title : 1,000,000,000 Kisses
76
+ # albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
77
+ # label : Stillwater/Fire
78
+ #
79
+ #
80
+ # (2) artist : Half Japanese,1,000,000
81
+ # title : 000 Kisses
82
+ # albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
83
+ # label : Stillwater/Fire
84
+ #
85
+ #
86
+ # (3) artist : Half Japanese,1
87
+ # title : 000,000,000 Kisses
88
+ # albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
89
+ # label : Stillwater/Fire
90
+ #
91
+ #
92
+ # (4) artist : Half Japanese,1,000
93
+ # title : 000,000 Kisses
94
+ # albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
95
+ # label : Stillwater/Fire
96
+
97
+ option.collect do |o|
98
+ result = o.to_s.scan(/\d{1,3}(?:,\d{1,3})*(?:\.\d+)?/)
99
+ if result.first && result.first.index(',')
100
+ result.first.size
101
+ else
102
+ 0
103
+ end
104
+ end.max.to_i
105
+ end
106
+
107
+ private
108
+
109
+ def rules
110
+ methods.grep(/options_that/)
111
+ end
112
+
113
+ end
114
+ end
@@ -46,7 +46,7 @@ module CommaSplice
46
46
  left_to_right_index = []
47
47
  @header.split(',').size.times do |time|
48
48
  left_to_right_index.push(@values.map do |value_line|
49
- value_line.split(',')[time].size
49
+ value_line.split(',')[time].to_s.size
50
50
  end.uniq.size == 1)
51
51
  end
52
52
 
@@ -57,7 +57,7 @@ module CommaSplice
57
57
  right_to_left_index = []
58
58
  @header.split(',').size.times do |time|
59
59
  right_to_left_index.unshift(@values.map do |value_line|
60
- value_line.split(',')[-time].size
60
+ value_line.split(',')[-time].to_s.size
61
61
  end.uniq.size == 1)
62
62
  end
63
63
 
@@ -21,8 +21,12 @@ module CommaSplice
21
21
  @values && @values.size > 0 && @headers.size != @values.size
22
22
  end
23
23
 
24
+ def needs_manual_input?
25
+ corrector.needs_manual_input?
26
+ end
27
+
24
28
  def original
25
- @values.join(',')
29
+ generate_csv_line(@values)
26
30
  end
27
31
 
28
32
  def corrected
@@ -37,11 +41,15 @@ module CommaSplice
37
41
 
38
42
  values_before = values[0...left_bounds]
39
43
  values_after = values.slice(right_bounds + 1, -(right_bounds + 1))
40
- [values_before, corrector.correction, values_after].flatten.join(',')
44
+ generate_csv_line([values_before, corrector.correction, values_after].flatten)
41
45
  end
42
46
 
43
47
  private
44
48
 
49
+ def generate_csv_line(values)
50
+ CSV.generate_line(values)
51
+ end
52
+
45
53
  def corrector
46
54
  CommaCalculator.new(selected_headers, selected_values)
47
55
  end
@@ -1,3 +1,3 @@
1
1
  module CommaSplice
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/comma_splice.rb CHANGED
@@ -8,6 +8,7 @@ require 'comma_splice/helpers/variable_column_finder'
8
8
  require 'comma_splice/helpers/line'
9
9
  require 'comma_splice/helpers/join_possibilities'
10
10
  require 'comma_splice/helpers/comma_calculator'
11
+ require 'comma_splice/helpers/option_scorer'
11
12
 
12
13
  require 'comma_splice/line_corrector'
13
14
  require 'comma_splice/file_corrector'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: comma_splice
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeff Keen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-24 00:00:00.000000000 Z
11
+ date: 2020-01-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -120,6 +120,7 @@ files:
120
120
  - lib/comma_splice/helpers/content_finder.rb
121
121
  - lib/comma_splice/helpers/join_possibilities.rb
122
122
  - lib/comma_splice/helpers/line.rb
123
+ - lib/comma_splice/helpers/option_scorer.rb
123
124
  - lib/comma_splice/helpers/variable_column_finder.rb
124
125
  - lib/comma_splice/line_corrector.rb
125
126
  - lib/comma_splice/version.rb