utf8_sanitizer 0.0.2.pre.rc.02 → 0.0.2.pre.rc.03

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a38bc8d107aceca249f57e024c44ac5a29a29d83b6caa6afbfe1b3e5bddc3092
4
- data.tar.gz: 8475996137f588b1a18f335770a08d19db0a5291076abdf0714ca487ed87b8f0
3
+ metadata.gz: 1128d1d63dfb289df84978713ef4fc7f8efc1bfb7d6784ab7accd85684824bff
4
+ data.tar.gz: 54111bdd0d5e3e1c6f97892c5404d68cbaf283068f77a3372e3bc6facb7cd9d3
5
5
  SHA512:
6
- metadata.gz: f1272a9c28b5e4420992a04089ef241a3c9c299aa2643a39b29ee3a411b1dbe18ccb7d0658c9e77a0ead3ae388f046a01e05d024b7558aef562022f3ebf703e9
7
- data.tar.gz: 4df59b0146fb3516bc1f1ce81c8ece8ed1104094681bb2e0ff70dfa1741af8f0cb13117218e2fb488b58f4c94484bca76470728b6fde56b52153f352cec48f0b
6
+ metadata.gz: 0e64c64ab3dafaa19fdc1cac3fae947b8aafe909bde3d747b8a23b8516ad5224769ae690605074a22e85082275e799173e866d58963ef951b62d7183d9503ae7
7
+ data.tar.gz: '0813f8565e88f697799fd3882ed883595c89d5af7a6042a792b1918fe4e97524ed0ba6da5ac0e1e21b0c91dc7ef89766122000cdeffaaa08505fbfbb300a63c5'
data/Rakefile CHANGED
@@ -13,6 +13,7 @@ task :console do
13
13
  require 'utf8_sanitizer' # You know what to do.
14
14
  require "active_support/all"
15
15
  ARGV.clear
16
- Utf8Sanitizer.run_wrap
16
+ # sanitized_data = Utf8Sanitizer.sanitize(file_path: "./lib/utf8_sanitizer/csv/seeds_mini.csv")
17
+ sanitized_data = Utf8Sanitizer.sanitize
17
18
  IRB.start
18
19
  end
@@ -19,11 +19,11 @@ module Utf8Sanitizer
19
19
 
20
20
  def grab_seed_file_path
21
21
  # "./lib/utf8_sanitizer/csv/seeds_clean.csv"
22
- # "./lib/utf8_sanitizer/csv/seeds_dirty.csv"
22
+ "./lib/utf8_sanitizer/csv/seeds_dirty.csv"
23
23
  # "./lib/utf8_sanitizer/csv/seeds_mega.csv"
24
24
  # "./lib/utf8_sanitizer/csv/seeds_mini.csv"
25
25
  # "./lib/utf8_sanitizer/csv/seeds_mini_10.csv"
26
- './lib/utf8_sanitizer/csv/seeds_mini_2_bug.csv'
26
+ # './lib/utf8_sanitizer/csv/seeds_mini_2_bug.csv'
27
27
  end
28
28
 
29
29
  ### Sample Hashes for validate_data
@@ -70,28 +70,5 @@ module Utf8Sanitizer
70
70
  phone: '(757) 490-3651' }]
71
71
  end
72
72
 
73
-
74
-
75
- def grab_seed_web_criteria
76
- pos_urls, neg_urls, neg_links, neg_hrefs, neg_exts = [], [], [], [], []
77
-
78
- neg_urls = %w[approv avis budget collis eat enterprise facebook financ food google gourmet hertz hotel hyatt insur invest loan lube mobility motel motorola parts quick rent repair restaur rv ryder service softwar travel twitter webhost yellowpages yelp youtube]
79
-
80
- pos_urls = ['acura', 'alfa romeo', 'aston martin', 'audi', 'bmw', 'bentley', 'bugatti', 'buick', 'cdjr', 'cadillac', 'chevrolet', 'chrysler', 'dodge', 'ferrari', 'fiat', 'ford', 'gmc', 'group', 'group', 'honda', 'hummer', 'hyundai', 'infiniti', 'isuzu', 'jaguar', 'jeep', 'kia', 'lamborghini', 'lexus', 'lincoln', 'lotus', 'mini', 'maserati', 'mazda', 'mclaren', 'mercedes-benz', 'mitsubishi', 'nissan', 'porsche', 'ram', 'rolls-royce', 'saab', 'scion', 'smart', 'subaru', 'suzuki', 'toyota', 'volkswagen', 'volvo']
81
-
82
- # neg_links = %w(: .biz .co .edu .gov .jpg .net // afri anounc book business buy bye call cash cheap click collis cont distrib download drop event face feature feed financ find fleet form gas generat graphic hello home hospi hour hours http info insta inventory item join login mail mailto mobile movie museu music news none offer part phone policy priva pump rate regist review schedul school service shop site test ticket tire tv twitter watch www yelp youth)
83
-
84
- # neg_hrefs = %w(? .com .jpg @ * afri after anounc apply approved blog book business buy call care career cash charit cheap check click collis commerc cont contrib deal distrib download employ event face feature feed financ find fleet form gas generat golf here holiday hospi hour info insta inventory join later light login mail mobile movie museu music news none now oil part pay phone policy priva pump quick quote rate regist review saving schedul service shop sign site speci ticket tire today transla travel truck tv twitter watch youth)
85
-
86
- neg_exts = %w[au ca edu es gov in ru uk us]
87
-
88
- oa_args = { pos_urls: pos_urls, neg_urls: neg_urls, neg_links: neg_links, neg_hrefs: neg_hrefs, neg_exts: neg_exts }
89
- oa_args.compact
90
- end
91
-
92
-
93
-
94
-
95
-
96
73
  end
97
74
  end
@@ -4,7 +4,6 @@
4
4
  module Utf8Sanitizer
5
5
  class UTF
6
6
  def initialize(args={})
7
- @utf_result = { stats: {}, data: {} }
8
7
  @valid_rows = []
9
8
  @encoded_rows = []
10
9
  @defective_rows = []
@@ -18,8 +17,7 @@ module Utf8Sanitizer
18
17
  def validate_data(args={})
19
18
  args = args.slice(:file_path, :data, :pollute_seeds)
20
19
  args = args.compact
21
-
22
- @seed = Seed.new if args.fetch(:pollute_seeds)
20
+ @seed = Seed.new if args[:pollute_seeds]
23
21
  file_path = args[:file_path]
24
22
  data = args[:data]
25
23
 
@@ -37,12 +35,10 @@ module Utf8Sanitizer
37
35
  perfect = groups['perfect']
38
36
 
39
37
  header_row_count = @headers.any? ? 1 : 0
40
- stats = { total_rows: @row_id, header_row: header_row_count, valid_rows: @valid_rows.count, error_rows: @error_rows.count, defective_rows: @defective_rows.count, perfect_rows: perfect, encoded_rows: @encoded_rows.count, wchar_rows: wchar }
41
- data = { valid_data: @valid_rows, encoded_data: @encoded_rows, defective_data: @defective_rows, error_data: @error_rows }
42
- @utf_result = { stats: stats, data: data }
43
- utf_result = @utf_result
44
- initialize
45
- utf_result
38
+ utf_result = {
39
+ stats: { total_rows: @row_id, header_row: header_row_count, valid_rows: @valid_rows.count, error_rows: @error_rows.count, defective_rows: @defective_rows.count, perfect_rows: perfect, encoded_rows: @encoded_rows.count, wchar_rows: wchar },
40
+ data: { valid_data: @valid_rows, encoded_data: @encoded_rows, defective_data: @defective_rows, error_data: @error_rows }
41
+ }
46
42
  end
47
43
 
48
44
  #################### * VALIDATE CSV * ####################
@@ -1,4 +1,4 @@
1
1
  module Utf8Sanitizer
2
2
  # VERSION = "0.0.1-rc.1"
3
- VERSION = "0.0.2.pre.rc.02"
3
+ VERSION = "0.0.2.pre.rc.03"
4
4
  end
@@ -1,14 +1,28 @@
1
1
  require "utf8_sanitizer/version"
2
- require 'utf8_sanitizer/run'
3
2
  require 'utf8_sanitizer/seed'
4
3
  require 'utf8_sanitizer/utf'
5
4
  require 'pry'
6
5
 
7
6
  module Utf8Sanitizer
8
7
 
9
- def self.run_wrap
10
- run = self::Run.new
11
- result = run.import ## returns formatted urls.
8
+ ## Args must include :data or :file_path, else seeds will run by default.
9
+ def self.sanitize(args={})
10
+ keys = args.compact.keys
11
+ input = { stats: nil, file_path: nil, data: nil }.merge(args)
12
+
13
+ ## Grabs seeds if :data or :file_path empty.
14
+ unless (keys & [:data, :file_path]).any?
15
+ ## Toggle data[:file_path] & data[:data] to test csv parsing or data hashes.
16
+ # input[:file_path] = Seed.new.grab_seed_file_path
17
+ input[:data] = Seed.new.grab_seed_hashes
18
+
19
+ ## For Testing: Pollute_seeds adds non-utf8 chars to each line.
20
+ input[:pollute_seeds] = true
21
+ end
22
+
23
+ ## Sanitizes input hash, then merges results to original input hash, and returns as sanitized_data.
24
+ input.merge!(Utf8Sanitizer::UTF.new.validate_data(input))
12
25
  end
13
26
 
27
+
14
28
  end
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
14
14
  spec.homepage = 'https://github.com/4rlm/utf8_sanitizer'
15
15
  spec.license = 'MIT'
16
16
 
17
- spec.summary = "Still in BETA: Removes invalid UTF8 characters, and extra whitespace (carriage returns, new lines, tabs, spaces, etc.) from csv, or strings."
18
- spec.description = "Still in BETA: Removes invalid UTF8 characters, and extra whitespace (carriage returns, new lines, tabs, spaces, etc.) from csv, or strings.\n Example: ABC Au\\xC1tos,123 E Main St,Anytown,TX,75142,(888) 555-1234\\n\\r\\n => ABC Autos,123 E Main St,Anytown,TX,75142,(888) 555-1234"
17
+ spec.summary = "Removes invalid UTF8 characters & extra whitespace from csv or strings."
18
+ spec.description = "Removes invalid UTF8 characters & extra whitespace (carriage returns, new lines, tabs, spaces, etc.) from csv, or strings.\n Example: ABC Au\\xC1tos,123 E Main St,Anytown,TX,75142,(888) 555-1234\\n\\r\\n => ABC Autos,123 E Main St,Anytown,TX,75142,(888) 555-1234"
19
19
 
20
20
  if spec.respond_to?(:metadata)
21
21
  spec.metadata['allowed_push_host'] = "https://rubygems.org"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8_sanitizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2.pre.rc.02
4
+ version: 0.0.2.pre.rc.03
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Booth
@@ -181,7 +181,7 @@ dependencies:
181
181
  - !ruby/object:Gem::Version
182
182
  version: 0.11.3
183
183
  description: |-
184
- Still in BETA: Removes invalid UTF8 characters, and extra whitespace (carriage returns, new lines, tabs, spaces, etc.) from csv, or strings.
184
+ Removes invalid UTF8 characters & extra whitespace (carriage returns, new lines, tabs, spaces, etc.) from csv, or strings.
185
185
  Example: ABC Au\xC1tos,123 E Main St,Anytown,TX,75142,(888) 555-1234\n\r\n => ABC Autos,123 E Main St,Anytown,TX,75142,(888) 555-1234
186
186
  email:
187
187
  - 4rlm@protonmail.ch
@@ -207,12 +207,10 @@ files:
207
207
  - lib/utf8_sanitizer/csv/seeds_mini.csv
208
208
  - lib/utf8_sanitizer/csv/seeds_mini_10.csv
209
209
  - lib/utf8_sanitizer/csv/seeds_mini_2_bug.csv
210
- - lib/utf8_sanitizer/run.rb
211
210
  - lib/utf8_sanitizer/seed.rb
212
211
  - lib/utf8_sanitizer/utf.rb
213
212
  - lib/utf8_sanitizer/version.rb
214
213
  - utf8_sanitizer.gemspec
215
- - utf8_sanitizer_gemspec_orig.txt
216
214
  homepage: https://github.com/4rlm/utf8_sanitizer
217
215
  licenses:
218
216
  - MIT
@@ -237,6 +235,5 @@ rubyforge_project:
237
235
  rubygems_version: 2.7.6
238
236
  signing_key:
239
237
  specification_version: 4
240
- summary: 'Still in BETA: Removes invalid UTF8 characters, and extra whitespace (carriage
241
- returns, new lines, tabs, spaces, etc.) from csv, or strings.'
238
+ summary: Removes invalid UTF8 characters & extra whitespace from csv or strings.
242
239
  test_files: []
@@ -1,31 +0,0 @@
1
-
2
- module Utf8Sanitizer
3
- class Run
4
-
5
- def initialize
6
- @crm_data = {}
7
- end
8
-
9
-
10
- def import(args={})
11
- @crm_data = { stats: nil, data: nil, file_path: nil, criteria: nil }
12
- @crm_data.merge!(args)
13
- keys = args.compact.keys
14
-
15
- unless (keys & [:data, :file_path]).any?
16
- @crm_data[:file_path] = Seed.new.grab_seed_file_path
17
- # @crm_data[:data] = Seed.new.grab_seed_hashes
18
- @crm_data[:pollute_seeds] = true
19
- unless keys.include?(:criteria)
20
- @crm_data[:criteria] = Seed.new.grab_seed_web_criteria
21
- end
22
- end
23
-
24
- utf_result = Utf8Sanitizer::UTF.new.validate_data(@crm_data)
25
- @crm_data.merge!(utf_result)
26
- end
27
-
28
-
29
-
30
- end
31
- end
@@ -1,36 +0,0 @@
1
-
2
- lib = File.expand_path("../lib", __FILE__)
3
- $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require "utf8_sanitizer/version"
5
-
6
- Gem::Specification.new do |spec|
7
- spec.name = "utf8_sanitizer"
8
- spec.version = Utf8Sanitizer::VERSION
9
- spec.authors = ["Adam Booth"]
10
- spec.email = ["4rlm@protonmail.ch"]
11
-
12
- spec.summary = %q{TODO: Write a short summary, because RubyGems requires one.}
13
- spec.description = %q{TODO: Write a longer description or delete this line.}
14
- spec.homepage = "TODO: Put your gem's website or public repo URL here."
15
- spec.license = "MIT"
16
-
17
- # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
18
- # to allow pushing to a single host or delete this section to allow pushing to any host.
19
- if spec.respond_to?(:metadata)
20
- spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
21
- else
22
- raise "RubyGems 2.0 or newer is required to protect against " \
23
- "public gem pushes."
24
- end
25
-
26
- spec.files = `git ls-files -z`.split("\x0").reject do |f|
27
- f.match(%r{^(test|spec|features)/})
28
- end
29
- spec.bindir = "exe"
30
- spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
31
- spec.require_paths = ["lib"]
32
-
33
- spec.add_development_dependency "bundler", "~> 1.16"
34
- spec.add_development_dependency "rake", "~> 10.0"
35
- spec.add_development_dependency "rspec", "~> 3.0"
36
- end