zarby 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/lib/zarby/csv.rb +10 -0
- data/lib/zarby/normalize.rb +11 -24
- data/lib/zarby/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c981e3c42a22e8cc916f44d4f4cef1b2f64029a70dfdce1d9e07a0273a48e6d8
|
4
|
+
data.tar.gz: 3925cdb2dba86f5c9b754380753d9ac0607886099c2e21480d85de620e09d166
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d19251edbcf44a06d2ec737a1f9d062a1d5ed8856cea22da0327b09d137aefea1b8974477d5c6834c4f46b271cfab5acb23022dabcef2d6570ae38175b2ae53a
|
7
|
+
data.tar.gz: 46720efbdd8d8bedc2edc8b912b68e45f19d2cd876603ed3737cce9f06a7525771a676af9cc07e536bcbb557e638c07296d5fe4e4a3b804d7fd83acab89f3b94
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,20 @@
|
|
1
|
+
# 0.1.5 / 2023-10-24
|
2
|
+
|
3
|
+
## Enhancements
|
4
|
+
|
5
|
+
* Refactor the utf8 method of the Zarby::Normalize class
|
6
|
+
* Add documentation to the Zarby::Csv and Zarby::Normalize classes
|
7
|
+
|
8
|
+
# 0.1.4 / 2023-10-24
|
9
|
+
|
10
|
+
* Remove missing debug log
|
11
|
+
|
12
|
+
# 0.1.3 / 2023-10-24
|
13
|
+
|
14
|
+
## Enhancements
|
15
|
+
|
16
|
+
* Decode ASCII-8BIT (actually windows)
|
17
|
+
|
1
18
|
# 0.1.2 / 2023-10-20
|
2
19
|
|
3
20
|
## Enhancements
|
data/lib/zarby/csv.rb
CHANGED
@@ -3,23 +3,30 @@
|
|
3
3
|
module Zarby
|
4
4
|
class NoColSepDetected < StandardError; end
|
5
5
|
|
6
|
+
# this class is used to detect the column separator in a CSV file
|
6
7
|
class Csv
|
7
8
|
COMMON_DELIMITERS = ['","', '";"', '":"', '"|"'].freeze
|
8
9
|
|
10
|
+
# @param [String] content
|
11
|
+
# @return [Csv]
|
9
12
|
def initialize(content:)
|
10
13
|
@content = content || ""
|
11
14
|
end
|
12
15
|
|
16
|
+
# @param [String] content
|
17
|
+
# @return [String]
|
13
18
|
def self.detect_separator(content)
|
14
19
|
new(content: content).detect_separator
|
15
20
|
end
|
16
21
|
|
22
|
+
# @return [String]
|
17
23
|
def detect_separator
|
18
24
|
valid? ? delimiters[0][0][1] : raise(Zarby::NoColSepDetected)
|
19
25
|
end
|
20
26
|
|
21
27
|
private
|
22
28
|
|
29
|
+
# @return [Boolean]
|
23
30
|
def valid?
|
24
31
|
!delimiters.collect(&:last).reduce(:+).zero?
|
25
32
|
end
|
@@ -28,14 +35,17 @@ module Zarby
|
|
28
35
|
# delimiters[0] #=> ["\";\"", 54]
|
29
36
|
# delimiters[0][0] #=> "\";\""
|
30
37
|
# delimiters[0][0][1] #=> ";"
|
38
|
+
# @return [Array<Array<String, Integer>>]
|
31
39
|
def delimiters
|
32
40
|
@delimiters ||= COMMON_DELIMITERS.inject({}, &count).sort(&most_found)
|
33
41
|
end
|
34
42
|
|
43
|
+
# @return [Proc]
|
35
44
|
def most_found
|
36
45
|
->(a, b) { b[1] <=> a[1] }
|
37
46
|
end
|
38
47
|
|
48
|
+
# @return [Proc]
|
39
49
|
def count
|
40
50
|
lambda { |hash, delimiter|
|
41
51
|
hash[delimiter] = @content.count(delimiter)
|
data/lib/zarby/normalize.rb
CHANGED
@@ -1,48 +1,35 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Zarby
|
4
|
+
# this class is used to normalize the input string to UTF-8
|
4
5
|
class Normalize
|
5
6
|
# utf-8 converting from the string's given encoding
|
6
|
-
COMMON_ENCODINGS = %w[UTF-8 Windows-1252 ASCII-8BIT US-ASCII].freeze
|
7
|
+
COMMON_ENCODINGS = %w[UTF-8 Windows-1252 ASCII-8BIT ISO-8859-1 US-ASCII].freeze
|
7
8
|
|
9
|
+
# @param input [String]
|
10
|
+
# @return [Normalize]
|
8
11
|
def initialize(input:)
|
9
|
-
@input = input ||
|
12
|
+
@input = input || ''
|
10
13
|
end
|
11
14
|
|
15
|
+
# @param input [String]
|
16
|
+
# @return [String]
|
12
17
|
def self.utf8(input)
|
13
18
|
new(input: input).utf8
|
14
19
|
end
|
15
20
|
|
21
|
+
# @return [String]
|
16
22
|
def utf8
|
17
23
|
output = @input if valid?
|
18
|
-
COMMON_ENCODINGS.each do |encoding|
|
19
|
-
output ||= convert { @input.encode(encoding) }
|
20
|
-
output ||= convert { @input.force_encoding('UTF-8') } if encoding == 'UTF-8'
|
21
|
-
end
|
22
24
|
|
23
|
-
output ||=
|
24
|
-
|
25
|
-
# replace any unknown characters with a placeholder: �
|
26
|
-
output ||= convert { @input.encode('UTF-8', invalid: :replace, undef: :replace) }
|
27
|
-
output
|
28
|
-
end
|
29
|
-
|
30
|
-
private
|
31
|
-
|
32
|
-
def convert
|
33
|
-
string = yield
|
34
|
-
string if string.valid_encoding?
|
25
|
+
output ||= @input.force_encoding(Encoding::ISO_8859_1).encode!(Encoding::UTF_8)
|
35
26
|
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
|
36
27
|
nil
|
37
28
|
end
|
38
29
|
|
39
|
-
|
40
|
-
string = yield
|
41
|
-
string if string.valid_encoding?
|
42
|
-
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
|
43
|
-
nil
|
44
|
-
end
|
30
|
+
private
|
45
31
|
|
32
|
+
# @return [Boolean]
|
46
33
|
def valid?
|
47
34
|
@input.encoding.name == 'UTF-8' && @input.valid_encoding?
|
48
35
|
end
|
data/lib/zarby/version.rb
CHANGED