utf8_sanitizer 0.0.2.pre.rc.02 → 0.0.2.pre.rc.03
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +2 -1
- data/lib/utf8_sanitizer/seed.rb +2 -25
- data/lib/utf8_sanitizer/utf.rb +5 -9
- data/lib/utf8_sanitizer/version.rb +1 -1
- data/lib/utf8_sanitizer.rb +18 -4
- data/utf8_sanitizer.gemspec +2 -2
- metadata +3 -6
- data/lib/utf8_sanitizer/run.rb +0 -31
- data/utf8_sanitizer_gemspec_orig.txt +0 -36
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1128d1d63dfb289df84978713ef4fc7f8efc1bfb7d6784ab7accd85684824bff
|
4
|
+
data.tar.gz: 54111bdd0d5e3e1c6f97892c5404d68cbaf283068f77a3372e3bc6facb7cd9d3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0e64c64ab3dafaa19fdc1cac3fae947b8aafe909bde3d747b8a23b8516ad5224769ae690605074a22e85082275e799173e866d58963ef951b62d7183d9503ae7
|
7
|
+
data.tar.gz: '0813f8565e88f697799fd3882ed883595c89d5af7a6042a792b1918fe4e97524ed0ba6da5ac0e1e21b0c91dc7ef89766122000cdeffaaa08505fbfbb300a63c5'
|
data/Rakefile
CHANGED
@@ -13,6 +13,7 @@ task :console do
|
|
13
13
|
require 'utf8_sanitizer' # You know what to do.
|
14
14
|
require "active_support/all"
|
15
15
|
ARGV.clear
|
16
|
-
Utf8Sanitizer.
|
16
|
+
# sanitized_data = Utf8Sanitizer.sanitize(file_path: "./lib/utf8_sanitizer/csv/seeds_mini.csv")
|
17
|
+
sanitized_data = Utf8Sanitizer.sanitize
|
17
18
|
IRB.start
|
18
19
|
end
|
data/lib/utf8_sanitizer/seed.rb
CHANGED
@@ -19,11 +19,11 @@ module Utf8Sanitizer
|
|
19
19
|
|
20
20
|
def grab_seed_file_path
|
21
21
|
# "./lib/utf8_sanitizer/csv/seeds_clean.csv"
|
22
|
-
|
22
|
+
"./lib/utf8_sanitizer/csv/seeds_dirty.csv"
|
23
23
|
# "./lib/utf8_sanitizer/csv/seeds_mega.csv"
|
24
24
|
# "./lib/utf8_sanitizer/csv/seeds_mini.csv"
|
25
25
|
# "./lib/utf8_sanitizer/csv/seeds_mini_10.csv"
|
26
|
-
'./lib/utf8_sanitizer/csv/seeds_mini_2_bug.csv'
|
26
|
+
# './lib/utf8_sanitizer/csv/seeds_mini_2_bug.csv'
|
27
27
|
end
|
28
28
|
|
29
29
|
### Sample Hashes for validate_data
|
@@ -70,28 +70,5 @@ module Utf8Sanitizer
|
|
70
70
|
phone: '(757) 490-3651' }]
|
71
71
|
end
|
72
72
|
|
73
|
-
|
74
|
-
|
75
|
-
def grab_seed_web_criteria
|
76
|
-
pos_urls, neg_urls, neg_links, neg_hrefs, neg_exts = [], [], [], [], []
|
77
|
-
|
78
|
-
neg_urls = %w[approv avis budget collis eat enterprise facebook financ food google gourmet hertz hotel hyatt insur invest loan lube mobility motel motorola parts quick rent repair restaur rv ryder service softwar travel twitter webhost yellowpages yelp youtube]
|
79
|
-
|
80
|
-
pos_urls = ['acura', 'alfa romeo', 'aston martin', 'audi', 'bmw', 'bentley', 'bugatti', 'buick', 'cdjr', 'cadillac', 'chevrolet', 'chrysler', 'dodge', 'ferrari', 'fiat', 'ford', 'gmc', 'group', 'group', 'honda', 'hummer', 'hyundai', 'infiniti', 'isuzu', 'jaguar', 'jeep', 'kia', 'lamborghini', 'lexus', 'lincoln', 'lotus', 'mini', 'maserati', 'mazda', 'mclaren', 'mercedes-benz', 'mitsubishi', 'nissan', 'porsche', 'ram', 'rolls-royce', 'saab', 'scion', 'smart', 'subaru', 'suzuki', 'toyota', 'volkswagen', 'volvo']
|
81
|
-
|
82
|
-
# neg_links = %w(: .biz .co .edu .gov .jpg .net // afri anounc book business buy bye call cash cheap click collis cont distrib download drop event face feature feed financ find fleet form gas generat graphic hello home hospi hour hours http info insta inventory item join login mail mailto mobile movie museu music news none offer part phone policy priva pump rate regist review schedul school service shop site test ticket tire tv twitter watch www yelp youth)
|
83
|
-
|
84
|
-
# neg_hrefs = %w(? .com .jpg @ * afri after anounc apply approved blog book business buy call care career cash charit cheap check click collis commerc cont contrib deal distrib download employ event face feature feed financ find fleet form gas generat golf here holiday hospi hour info insta inventory join later light login mail mobile movie museu music news none now oil part pay phone policy priva pump quick quote rate regist review saving schedul service shop sign site speci ticket tire today transla travel truck tv twitter watch youth)
|
85
|
-
|
86
|
-
neg_exts = %w[au ca edu es gov in ru uk us]
|
87
|
-
|
88
|
-
oa_args = { pos_urls: pos_urls, neg_urls: neg_urls, neg_links: neg_links, neg_hrefs: neg_hrefs, neg_exts: neg_exts }
|
89
|
-
oa_args.compact
|
90
|
-
end
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
73
|
end
|
97
74
|
end
|
data/lib/utf8_sanitizer/utf.rb
CHANGED
@@ -4,7 +4,6 @@
|
|
4
4
|
module Utf8Sanitizer
|
5
5
|
class UTF
|
6
6
|
def initialize(args={})
|
7
|
-
@utf_result = { stats: {}, data: {} }
|
8
7
|
@valid_rows = []
|
9
8
|
@encoded_rows = []
|
10
9
|
@defective_rows = []
|
@@ -18,8 +17,7 @@ module Utf8Sanitizer
|
|
18
17
|
def validate_data(args={})
|
19
18
|
args = args.slice(:file_path, :data, :pollute_seeds)
|
20
19
|
args = args.compact
|
21
|
-
|
22
|
-
@seed = Seed.new if args.fetch(:pollute_seeds)
|
20
|
+
@seed = Seed.new if args[:pollute_seeds]
|
23
21
|
file_path = args[:file_path]
|
24
22
|
data = args[:data]
|
25
23
|
|
@@ -37,12 +35,10 @@ module Utf8Sanitizer
|
|
37
35
|
perfect = groups['perfect']
|
38
36
|
|
39
37
|
header_row_count = @headers.any? ? 1 : 0
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
initialize
|
45
|
-
utf_result
|
38
|
+
utf_result = {
|
39
|
+
stats: { total_rows: @row_id, header_row: header_row_count, valid_rows: @valid_rows.count, error_rows: @error_rows.count, defective_rows: @defective_rows.count, perfect_rows: perfect, encoded_rows: @encoded_rows.count, wchar_rows: wchar },
|
40
|
+
data: { valid_data: @valid_rows, encoded_data: @encoded_rows, defective_data: @defective_rows, error_data: @error_rows }
|
41
|
+
}
|
46
42
|
end
|
47
43
|
|
48
44
|
#################### * VALIDATE CSV * ####################
|
data/lib/utf8_sanitizer.rb
CHANGED
@@ -1,14 +1,28 @@
|
|
1
1
|
require "utf8_sanitizer/version"
|
2
|
-
require 'utf8_sanitizer/run'
|
3
2
|
require 'utf8_sanitizer/seed'
|
4
3
|
require 'utf8_sanitizer/utf'
|
5
4
|
require 'pry'
|
6
5
|
|
7
6
|
module Utf8Sanitizer
|
8
7
|
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
## Args must include :data or :file_path, else seeds will run by default.
|
9
|
+
def self.sanitize(args={})
|
10
|
+
keys = args.compact.keys
|
11
|
+
input = { stats: nil, file_path: nil, data: nil }.merge(args)
|
12
|
+
|
13
|
+
## Grabs seeds if :data or :file_path empty.
|
14
|
+
unless (keys & [:data, :file_path]).any?
|
15
|
+
## Toggle data[:file_path] & data[:data] to test csv parsing or data hashes.
|
16
|
+
# input[:file_path] = Seed.new.grab_seed_file_path
|
17
|
+
input[:data] = Seed.new.grab_seed_hashes
|
18
|
+
|
19
|
+
## For Testing: Pollute_seeds adds non-utf8 chars to each line.
|
20
|
+
input[:pollute_seeds] = true
|
21
|
+
end
|
22
|
+
|
23
|
+
## Sanitizes input hash, then merges results to original input hash, and returns as sanitized_data.
|
24
|
+
input.merge!(Utf8Sanitizer::UTF.new.validate_data(input))
|
12
25
|
end
|
13
26
|
|
27
|
+
|
14
28
|
end
|
data/utf8_sanitizer.gemspec
CHANGED
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
|
|
14
14
|
spec.homepage = 'https://github.com/4rlm/utf8_sanitizer'
|
15
15
|
spec.license = 'MIT'
|
16
16
|
|
17
|
-
spec.summary = "
|
18
|
-
spec.description = "
|
17
|
+
spec.summary = "Removes invalid UTF8 characters & extra whitespace from csv or strings."
|
18
|
+
spec.description = "Removes invalid UTF8 characters & extra whitespace (carriage returns, new lines, tabs, spaces, etc.) from csv, or strings.\n Example: ABC Au\\xC1tos,123 E Main St,Anytown,TX,75142,(888) 555-1234\\n\\r\\n => ABC Autos,123 E Main St,Anytown,TX,75142,(888) 555-1234"
|
19
19
|
|
20
20
|
if spec.respond_to?(:metadata)
|
21
21
|
spec.metadata['allowed_push_host'] = "https://rubygems.org"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8_sanitizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2.pre.rc.
|
4
|
+
version: 0.0.2.pre.rc.03
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Booth
|
@@ -181,7 +181,7 @@ dependencies:
|
|
181
181
|
- !ruby/object:Gem::Version
|
182
182
|
version: 0.11.3
|
183
183
|
description: |-
|
184
|
-
|
184
|
+
Removes invalid UTF8 characters & extra whitespace (carriage returns, new lines, tabs, spaces, etc.) from csv, or strings.
|
185
185
|
Example: ABC Au\xC1tos,123 E Main St,Anytown,TX,75142,(888) 555-1234\n\r\n => ABC Autos,123 E Main St,Anytown,TX,75142,(888) 555-1234
|
186
186
|
email:
|
187
187
|
- 4rlm@protonmail.ch
|
@@ -207,12 +207,10 @@ files:
|
|
207
207
|
- lib/utf8_sanitizer/csv/seeds_mini.csv
|
208
208
|
- lib/utf8_sanitizer/csv/seeds_mini_10.csv
|
209
209
|
- lib/utf8_sanitizer/csv/seeds_mini_2_bug.csv
|
210
|
-
- lib/utf8_sanitizer/run.rb
|
211
210
|
- lib/utf8_sanitizer/seed.rb
|
212
211
|
- lib/utf8_sanitizer/utf.rb
|
213
212
|
- lib/utf8_sanitizer/version.rb
|
214
213
|
- utf8_sanitizer.gemspec
|
215
|
-
- utf8_sanitizer_gemspec_orig.txt
|
216
214
|
homepage: https://github.com/4rlm/utf8_sanitizer
|
217
215
|
licenses:
|
218
216
|
- MIT
|
@@ -237,6 +235,5 @@ rubyforge_project:
|
|
237
235
|
rubygems_version: 2.7.6
|
238
236
|
signing_key:
|
239
237
|
specification_version: 4
|
240
|
-
summary:
|
241
|
-
returns, new lines, tabs, spaces, etc.) from csv, or strings.'
|
238
|
+
summary: Removes invalid UTF8 characters & extra whitespace from csv or strings.
|
242
239
|
test_files: []
|
data/lib/utf8_sanitizer/run.rb
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
|
2
|
-
module Utf8Sanitizer
|
3
|
-
class Run
|
4
|
-
|
5
|
-
def initialize
|
6
|
-
@crm_data = {}
|
7
|
-
end
|
8
|
-
|
9
|
-
|
10
|
-
def import(args={})
|
11
|
-
@crm_data = { stats: nil, data: nil, file_path: nil, criteria: nil }
|
12
|
-
@crm_data.merge!(args)
|
13
|
-
keys = args.compact.keys
|
14
|
-
|
15
|
-
unless (keys & [:data, :file_path]).any?
|
16
|
-
@crm_data[:file_path] = Seed.new.grab_seed_file_path
|
17
|
-
# @crm_data[:data] = Seed.new.grab_seed_hashes
|
18
|
-
@crm_data[:pollute_seeds] = true
|
19
|
-
unless keys.include?(:criteria)
|
20
|
-
@crm_data[:criteria] = Seed.new.grab_seed_web_criteria
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
utf_result = Utf8Sanitizer::UTF.new.validate_data(@crm_data)
|
25
|
-
@crm_data.merge!(utf_result)
|
26
|
-
end
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
end
|
31
|
-
end
|
@@ -1,36 +0,0 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path("../lib", __FILE__)
|
3
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require "utf8_sanitizer/version"
|
5
|
-
|
6
|
-
Gem::Specification.new do |spec|
|
7
|
-
spec.name = "utf8_sanitizer"
|
8
|
-
spec.version = Utf8Sanitizer::VERSION
|
9
|
-
spec.authors = ["Adam Booth"]
|
10
|
-
spec.email = ["4rlm@protonmail.ch"]
|
11
|
-
|
12
|
-
spec.summary = %q{TODO: Write a short summary, because RubyGems requires one.}
|
13
|
-
spec.description = %q{TODO: Write a longer description or delete this line.}
|
14
|
-
spec.homepage = "TODO: Put your gem's website or public repo URL here."
|
15
|
-
spec.license = "MIT"
|
16
|
-
|
17
|
-
# Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
|
18
|
-
# to allow pushing to a single host or delete this section to allow pushing to any host.
|
19
|
-
if spec.respond_to?(:metadata)
|
20
|
-
spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
|
21
|
-
else
|
22
|
-
raise "RubyGems 2.0 or newer is required to protect against " \
|
23
|
-
"public gem pushes."
|
24
|
-
end
|
25
|
-
|
26
|
-
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
27
|
-
f.match(%r{^(test|spec|features)/})
|
28
|
-
end
|
29
|
-
spec.bindir = "exe"
|
30
|
-
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
31
|
-
spec.require_paths = ["lib"]
|
32
|
-
|
33
|
-
spec.add_development_dependency "bundler", "~> 1.16"
|
34
|
-
spec.add_development_dependency "rake", "~> 10.0"
|
35
|
-
spec.add_development_dependency "rspec", "~> 3.0"
|
36
|
-
end
|