comma_splice 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74958cacc824b9e475df20acee4cc75ac354738ab4ed475a7ddc5f3ce5c7c553
4
- data.tar.gz: 3b7129b58e24be7e9549e2971b994fd159e0d21675118e3c413a38dc1823db9c
3
+ metadata.gz: bd44bca5b615b5f267ae81b16aaa6b379b53963e637af1cc000d6229996963d4
4
+ data.tar.gz: bd70a25937f8baf3cec97eeb5276ce050f3898a76f5be9cf8e0c8e726bc5a8fa
5
5
  SHA512:
6
- metadata.gz: 112fea4670275a7222bbf60d13c8f61cea9386a60eaaf2cf16fd514f5c0ffd69a4e2e0453658408977607608340ca1e94e7f53581565d10014501c18d61a6570
7
- data.tar.gz: b62a9835b22555efd4aad03c467c1d24f298d50c34b399171f8fc5bcb0d8c1ddfff1be14268599fe5ce67b88c6b9201c6c3e250e72de34361ba252594a6146e1
6
+ metadata.gz: 991f2138d4d08941b1a83d231338a5197bce6bcea0e136c3cbb67ca67cf04b31ad425ce148d48f2bf5ca8280bfee2b736d7ace002cd043dac071bdee8ca2ad2f
7
+ data.tar.gz: 4d9e601ab359fe6511d9c54b3469104e64f057953924a7867c6b7e58df224c9b5d9b279060ff7e67e3d965da7460334ed23f97c1fd2792c32441e2cc9f8161da
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # Changelog
2
2
 
3
+ ### 0.2 (January 27, 2020)
4
+ - [IMPROVEMENT] Add scoring model to better handle cases that needed prompting before, like comma-separated numbers
5
+ - [IMPROVEMENT] Correct line escaping even on lines that don't have incorrect commas to ensure correct parsing of generated CSV down the line
6
+ - [IMPROVEMENT] Use ruby csv library to generate lines instead of handling escaping cases manually
7
+
3
8
  ### 0.1.1 (January 24, 2020)
4
9
  - [BUGFIX] handle case where all columns are equal widths
5
10
  - [BUGFIX] Improve error message
data/Gemfile CHANGED
@@ -1,4 +1,3 @@
1
1
  source "https://rubygems.org"
2
-
3
2
  # Specify your gem's dependencies in comma_splice.gemspec
4
3
  gemspec
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- comma_splice (0.1.1)
4
+ comma_splice (0.2.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/bin/comma_splice CHANGED
@@ -8,7 +8,12 @@ class CommaSpliceCLI < Thor
8
8
  class_option :start_line, type: :numeric, default: nil
9
9
  class_option :end_line, type: :numeric, default: nil
10
10
 
11
- desc 'fix FILE_PATH [SAVE_PATH]', 'return corrected file contents'
11
+ desc 'version', 'print the current comma_splice version'
12
+ def version
13
+ puts CommaSplice::VERSION
14
+ end
15
+
16
+ desc 'correct FILE_PATH', 'return corrected file contents'
12
17
  def correct(file_path)
13
18
  file_corrector = CommaSplice::FileCorrector.new(
14
19
  file_path,
@@ -9,7 +9,7 @@ module CommaSplice
9
9
  @content_finder = ContentFinder.new(@file_contents, start_line, end_line)
10
10
  @csv_content = @content_finder.content
11
11
  @start_line = @content_finder.start_line
12
- @end_line = @content_finder.start_line
12
+ @end_line = @content_finder.end_line
13
13
 
14
14
  if start_column && end_column
15
15
  @start_column = start_column
@@ -35,6 +35,10 @@ module CommaSplice
35
35
  bad_lines.size.positive?
36
36
  end
37
37
 
38
+ def needs_manual_input?
39
+ line_correctors.any?(&:needs_manual_input?)
40
+ end
41
+
38
42
  def corrected
39
43
  @corrected ||= [
40
44
  @file_contents.lines[0, @start_line],
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  module CommaSplice
2
3
  # provide an array of CSV headers and an array of CSV values
3
4
  # and this will figure out the best correction and prompt
@@ -5,17 +6,17 @@ module CommaSplice
5
6
 
6
7
  class CommaCalculator
7
8
  def initialize(headers, values)
9
+ raise StandardError, "Determining all the possibilities to fit #{values.size} values into the #{headers.size} headers #{headers.inspect} is computationally expensive. Please specify the columns where commas might be." if headers.size > 10 && values.size > 10
10
+
8
11
  @headers = headers
9
12
  @values = values
10
-
11
- raise StandardError, "Determining all the possibilities to fit #{@values.size} values into the #{@headers.size} headers #{@headers.inspect} is computationally expensive. Please specify the columns where commas might be." if @headers.size > 10 && @values.size > 10
12
13
  end
13
14
 
14
15
  def correction
15
- if @headers.size === @values.size
16
+ if @headers.size == @values.size
16
17
  @values
17
18
  elsif best_options.size == 1
18
- best_options.first
19
+ best_options.first.option
19
20
  elsif best_options.size > 1
20
21
  prompt_for_options(best_options)
21
22
  else
@@ -24,12 +25,14 @@ module CommaSplice
24
25
  end
25
26
 
26
27
  def all_options
27
- options = join_possibilities.collect do |joins|
28
+ @all_options ||= join_possibilities.collect do |joins|
28
29
  values = @values.dup
29
30
  joins.collect do |join_num|
30
31
  val = values.shift(join_num)
31
- if val.size > 1
32
- quoted_values(val)
32
+ if val.empty?
33
+ nil
34
+ elsif val.size == 1
35
+ val.first
33
36
  else
34
37
  val.join(',')
35
38
  end
@@ -37,14 +40,23 @@ module CommaSplice
37
40
  end
38
41
  end
39
42
 
40
- def best_options
41
- all_options.select do |option|
42
- option.none? { |o| o.starts_with?(' ') || o.starts_with?('" ') }
43
+ def ranked_options
44
+ @ranked_options ||= all_options.collect do |option|
45
+ OptionScorer.new(option)
43
46
  end
44
47
  end
45
48
 
46
- def requires_manual_input?
47
- needs_correcting? && best_options.many?
49
+ def score_option(option)
50
+ OptionScorer.new(option).score
51
+ end
52
+
53
+ def best_options
54
+ max_score = ranked_options.collect { |o| o.score }.max
55
+ ranked_options.select { |o| o.score == max_score }
56
+ end
57
+
58
+ def needs_manual_input?
59
+ !best_options.one?
48
60
  end
49
61
 
50
62
  def needs_correcting?
@@ -53,10 +65,6 @@ module CommaSplice
53
65
 
54
66
  private
55
67
 
56
- def quoted_values(values)
57
- "\"#{values.join(',').gsub(/(?<!")(?:"{2})*\K\"/, '""')}\"" # escape a double quote if it hasn't been escaped already
58
- end
59
-
60
68
  def join_possibilities
61
69
  JoinPossibilities.new(@values.size, @headers.size).possibilities
62
70
  end
@@ -66,21 +74,34 @@ module CommaSplice
66
74
 
67
75
  options.each_with_index do |option, index|
68
76
  @headers.each_with_index do |header, i|
69
- marker = i.zero? ? "(#{index + 1})" : ''
70
- puts marker.ljust(5) +
77
+ marker = if i.zero?
78
+ "(#{index + 1})"
79
+ else
80
+ ''
81
+ end
82
+
83
+ puts marker.ljust(7) +
71
84
  header.ljust(longest_header.size) + ': ' +
72
- option[i]
85
+ option.option[i].to_s
73
86
  end
87
+ puts ''.ljust(7) + "(score = #{option.score})"
88
+ puts ''.ljust(7) + option.breakdown
74
89
  puts "\n"
75
90
  end
76
91
 
92
+ puts "press 0 to see all options" if all_options.size != options.size
93
+
77
94
  selected_option = nil
78
- until selected_option && selected_option.to_i > 0
95
+ until selected_option && selected_option.to_i > -1
79
96
  puts 'which one is correct?'
80
97
  selected_option = STDIN.gets
81
98
  end
82
99
 
83
- options[selected_option.to_i - 1]
100
+ if selected_option.to_i == 0
101
+ prompt_for_options(ranked_options.sort_by { |s| s.score.to_i }.reverse)
102
+ else
103
+ options[selected_option.to_i - 1].option
104
+ end
84
105
  end
85
106
  end
86
107
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module CommaSplice
2
4
  # Given a file this will find the CSV content. Some files have some non-csv junk at the top
3
5
 
@@ -26,11 +28,11 @@ module CommaSplice
26
28
  Line.new(line).values.size < 2
27
29
  end
28
30
 
29
- if relative_end_line
30
- @end_line = @start_line + relative_end_line - 1
31
- else
32
- @end_line = -1
33
- end
31
+ @end_line = if relative_end_line
32
+ @start_line + relative_end_line - 1
33
+ else
34
+ -1
35
+ end
34
36
 
35
37
  @content = @file_contents.lines[@start_line..@end_line]
36
38
  end
@@ -38,7 +40,7 @@ module CommaSplice
38
40
  def parsed
39
41
  quote_chars = %w[" | ~ ^ & *]
40
42
  begin
41
- CSV.parse(@content.join('\n'), quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
43
+ CSV.parse(@content.join("\n"), quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
42
44
  rescue CSV::MalformedCSVError
43
45
  quote_chars.empty? ? raise : retry
44
46
  end
@@ -0,0 +1,114 @@
1
+ module CommaSplice
2
+ # scores options based on how likely they are to be correct
3
+
4
+ class OptionScorer
5
+ attr_reader :option
6
+
7
+ def initialize(option)
8
+ @option = option
9
+ @start_score = 100
10
+ end
11
+
12
+ def breakdown
13
+ score = @start_score
14
+ breakdown = []
15
+
16
+ rules.each do |rule|
17
+ rule_score = send(rule.to_sym)
18
+ score += rule_score
19
+ if rule_score != 0
20
+ breakdown << "#{rule_score.to_s.ljust(3)} #{rule.to_sym}"
21
+ end
22
+ end
23
+
24
+ breakdown.join("\n")
25
+ end
26
+
27
+ def score
28
+ score = @start_score
29
+ rules.each do |rule|
30
+ score += send(rule.to_sym)
31
+ end
32
+ score
33
+ end
34
+
35
+ def options_that_start_with_a_space
36
+ option.select do |o|
37
+ o.to_s.starts_with?(' ')
38
+ end.size * -10
39
+ end
40
+
41
+ def options_that_start_with_a_quote_followed_by_a_space
42
+ option.select do |o|
43
+ o.to_s.starts_with?('" ')
44
+ end.size * -1
45
+ end
46
+
47
+ def options_that_start_with_a_comma
48
+ option.select do |o|
49
+ o.to_s.starts_with?(',')
50
+ end.size * -5
51
+ end
52
+
53
+ def options_that_end_with_a_comma
54
+ option.select do |o|
55
+ o.to_s.ends_with?(',')
56
+ end.size * -5
57
+ end
58
+
59
+ def options_that_have_words_joined_by_commas
60
+ option.select do |o|
61
+ o.to_s.match(/[A-Za-z],[A-Za-z]/)
62
+ end.compact.size * -5
63
+ end
64
+
65
+ def options_that_are_blank
66
+ option.select do |o|
67
+ o.to_s.strip.blank?
68
+ end.size * -5
69
+ end
70
+
71
+ def options_that_have_longest_comma_separated_number
72
+ # favor items that have a longer comma separated number
73
+ # i.e in the following example, option 1 should win
74
+ # (1) artist : Half Japanese
75
+ # title : 1,000,000,000 Kisses
76
+ # albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
77
+ # label : Stillwater/Fire
78
+ #
79
+ #
80
+ # (2) artist : Half Japanese,1,000,000
81
+ # title : 000 Kisses
82
+ # albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
83
+ # label : Stillwater/Fire
84
+ #
85
+ #
86
+ # (3) artist : Half Japanese,1
87
+ # title : 000,000,000 Kisses
88
+ # albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
89
+ # label : Stillwater/Fire
90
+ #
91
+ #
92
+ # (4) artist : Half Japanese,1,000
93
+ # title : 000,000 Kisses
94
+ # albumtitle: Beautiful Songs: The Best of Jad Fair & Half Japanese
95
+ # label : Stillwater/Fire
96
+
97
+ option.collect do |o|
98
+ result = o.to_s.scan(/\d{1,3}(?:,\d{1,3})*(?:\.\d+)?/)
99
+ if result.first && result.first.index(',')
100
+ result.first.size
101
+ else
102
+ 0
103
+ end
104
+ end.max.to_i
105
+ end
106
+
107
+ private
108
+
109
+ def rules
110
+ methods.grep(/options_that/)
111
+ end
112
+
113
+ end
114
+ end
@@ -46,7 +46,7 @@ module CommaSplice
46
46
  left_to_right_index = []
47
47
  @header.split(',').size.times do |time|
48
48
  left_to_right_index.push(@values.map do |value_line|
49
- value_line.split(',')[time].size
49
+ value_line.split(',')[time].to_s.size
50
50
  end.uniq.size == 1)
51
51
  end
52
52
 
@@ -57,7 +57,7 @@ module CommaSplice
57
57
  right_to_left_index = []
58
58
  @header.split(',').size.times do |time|
59
59
  right_to_left_index.unshift(@values.map do |value_line|
60
- value_line.split(',')[-time].size
60
+ value_line.split(',')[-time].to_s.size
61
61
  end.uniq.size == 1)
62
62
  end
63
63
 
@@ -21,8 +21,12 @@ module CommaSplice
21
21
  @values && @values.size > 0 && @headers.size != @values.size
22
22
  end
23
23
 
24
+ def needs_manual_input?
25
+ corrector.needs_manual_input?
26
+ end
27
+
24
28
  def original
25
- @values.join(',')
29
+ generate_csv_line(@values)
26
30
  end
27
31
 
28
32
  def corrected
@@ -37,11 +41,15 @@ module CommaSplice
37
41
 
38
42
  values_before = values[0...left_bounds]
39
43
  values_after = values.slice(right_bounds + 1, -(right_bounds + 1))
40
- [values_before, corrector.correction, values_after].flatten.join(',')
44
+ generate_csv_line([values_before, corrector.correction, values_after].flatten)
41
45
  end
42
46
 
43
47
  private
44
48
 
49
+ def generate_csv_line(values)
50
+ CSV.generate_line(values)
51
+ end
52
+
45
53
  def corrector
46
54
  CommaCalculator.new(selected_headers, selected_values)
47
55
  end
@@ -1,3 +1,3 @@
1
1
  module CommaSplice
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/comma_splice.rb CHANGED
@@ -8,6 +8,7 @@ require 'comma_splice/helpers/variable_column_finder'
8
8
  require 'comma_splice/helpers/line'
9
9
  require 'comma_splice/helpers/join_possibilities'
10
10
  require 'comma_splice/helpers/comma_calculator'
11
+ require 'comma_splice/helpers/option_scorer'
11
12
 
12
13
  require 'comma_splice/line_corrector'
13
14
  require 'comma_splice/file_corrector'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: comma_splice
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeff Keen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-24 00:00:00.000000000 Z
11
+ date: 2020-01-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -120,6 +120,7 @@ files:
120
120
  - lib/comma_splice/helpers/content_finder.rb
121
121
  - lib/comma_splice/helpers/join_possibilities.rb
122
122
  - lib/comma_splice/helpers/line.rb
123
+ - lib/comma_splice/helpers/option_scorer.rb
123
124
  - lib/comma_splice/helpers/variable_column_finder.rb
124
125
  - lib/comma_splice/line_corrector.rb
125
126
  - lib/comma_splice/version.rb