scrub_db 2.22 → 2.23
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +27 -4
- data/lib/scrub_db/strings.rb +23 -15
- data/lib/scrub_db/version.rb +1 -1
- data/lib/scrub_db/webs.rb +1 -1
- data/lib/webs_criteria.rb +2 -2
- data/scrub_db.gemspec +2 -3
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 71f435783cd9717b8a67d76e5866ed6a60ce25582e9df4e96ba8690346730b7e
|
4
|
+
data.tar.gz: 84d726d3d4d35ebfa89b71f943aa3c48063fe9fe5ac2bb72fcbf99d420cb8a17
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f032de540a7aa3521f2cb01d66d89339f354d4fedb4d221744b4a046a7f41942296aca6e21fdd16caa0c72c1deb1a7d1d55d610785b584edaac84f0c344ee60
|
7
|
+
data.tar.gz: e31b40b6a7f41be06368812e7796125d2f593f9feee5712fe854b4293117367724dd0ebdc5ecdc93a9c9c54cb7b5f7850a3755b84cb14a1764679601b2b50db0
|
data/Rakefile
CHANGED
@@ -17,15 +17,39 @@ task :console do
|
|
17
17
|
require "active_support/all"
|
18
18
|
ARGV.clear
|
19
19
|
|
20
|
-
scrubbed_webs = run_scrub_webs
|
20
|
+
# scrubbed_webs = run_scrub_webs
|
21
21
|
# scrubbed_proper_strings = run_scrub_proper_strings
|
22
22
|
# scrubbed_strings = run_scrub_strings
|
23
|
-
|
23
|
+
scrubbed_hash = run_scrub_string
|
24
|
+
scrubbed_hash = run_scrub_proper_string
|
25
|
+
binding.pry
|
24
26
|
|
25
27
|
IRB.start
|
26
28
|
end
|
27
29
|
|
28
30
|
|
31
|
+
def run_scrub_string
|
32
|
+
strings_criteria = {
|
33
|
+
pos_criteria: WebsCriteria.seed_pos_urls,
|
34
|
+
neg_criteria: WebsCriteria.seed_neg_urls
|
35
|
+
}
|
36
|
+
string = 'quick auto-approval gmc and bmw-world of AUSTIN tx, INC'
|
37
|
+
strings_obj = ScrubDb::Strings.new(strings_criteria)
|
38
|
+
scrubbed_hash = strings_obj.scrub_string(string)
|
39
|
+
end
|
40
|
+
|
41
|
+
|
42
|
+
def run_scrub_proper_string
|
43
|
+
strings_criteria = {
|
44
|
+
pos_criteria: WebsCriteria.seed_pos_urls,
|
45
|
+
neg_criteria: WebsCriteria.seed_neg_urls
|
46
|
+
}
|
47
|
+
string = 'quick auto-approval gmc and bmw-world of AUSTIN tx, INC'
|
48
|
+
strings_obj = ScrubDb::Strings.new(strings_criteria)
|
49
|
+
scrubbed_hash = strings_obj.scrub_proper_string(string)
|
50
|
+
end
|
51
|
+
|
52
|
+
|
29
53
|
def run_scrub_strings
|
30
54
|
strings_criteria = {
|
31
55
|
pos_criteria: WebsCriteria.seed_pos_urls,
|
@@ -82,11 +106,10 @@ def run_scrub_proper_strings
|
|
82
106
|
]
|
83
107
|
|
84
108
|
strings_obj = ScrubDb::Strings.new(strings_criteria)
|
85
|
-
scrubbed_proper_strings = strings_obj.scrub_proper_strings(
|
109
|
+
scrubbed_proper_strings = strings_obj.scrub_proper_strings(array_of_strings)
|
86
110
|
end
|
87
111
|
|
88
112
|
|
89
|
-
|
90
113
|
def run_scrub_webs
|
91
114
|
urls = %w[
|
92
115
|
austinchevrolet.not.real
|
data/lib/scrub_db/strings.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
|
2
|
-
|
3
2
|
module ScrubDb
|
4
3
|
class Strings
|
5
4
|
attr_accessor :filter, :empty_criteria
|
@@ -7,30 +6,40 @@ module ScrubDb
|
|
7
6
|
def initialize(criteria={})
|
8
7
|
@empty_criteria = criteria&.empty?
|
9
8
|
@filter = ScrubDb::Filter.new(criteria) unless @empty_criteria
|
9
|
+
@crmf = CrmFormatter
|
10
10
|
end
|
11
11
|
|
12
12
|
def scrub_proper_strings(props=[])
|
13
|
-
prop_hashes =
|
14
|
-
prop_hashes = merge_criteria(prop_hashes)
|
15
|
-
prop_hashes.map! { |prop_hsh| scrub_hash(prop_hsh) }
|
13
|
+
prop_hashes = props.map! { |str| scrub_proper_string(str) }
|
16
14
|
end
|
17
15
|
|
18
|
-
def scrub_strings(
|
19
|
-
str_hashes =
|
20
|
-
str_hashes = merge_criteria(str_hashes)
|
21
|
-
str_hashes.map! { |str_hsh| scrub_hash(str_hsh) }
|
16
|
+
def scrub_strings(strs=[])
|
17
|
+
str_hashes = strs.map! { |str| scrub_string(str) }
|
22
18
|
end
|
23
19
|
|
24
|
-
|
25
|
-
|
20
|
+
|
21
|
+
def scrub_proper_string(string)
|
22
|
+
hsh = @crmf.format_proper(string)
|
23
|
+
hsh = merge_criteria(hsh)
|
24
|
+
hsh = scrub_hash(hsh)
|
26
25
|
end
|
27
26
|
|
28
|
-
def
|
29
|
-
|
30
|
-
|
31
|
-
|
27
|
+
def scrub_string(string)
|
28
|
+
hsh = string_to_hash(string)
|
29
|
+
hsh = merge_criteria(hsh)
|
30
|
+
hsh = scrub_hash(hsh)
|
31
|
+
end
|
32
|
+
|
33
|
+
|
34
|
+
def string_to_hash(string)
|
35
|
+
hsh = { string: string, pos_criteria: [], neg_criteria: [] }
|
36
|
+
end
|
37
|
+
|
38
|
+
def merge_criteria(hsh)
|
39
|
+
hsh = hsh.merge({ pos_criteria: [], neg_criteria: [] })
|
32
40
|
end
|
33
41
|
|
42
|
+
|
34
43
|
def scrub_hash(hsh)
|
35
44
|
str = hsh[:string]
|
36
45
|
prop = hsh[:proper_f]
|
@@ -48,5 +57,4 @@ module ScrubDb
|
|
48
57
|
end
|
49
58
|
|
50
59
|
end
|
51
|
-
|
52
60
|
end
|
data/lib/scrub_db/version.rb
CHANGED
data/lib/scrub_db/webs.rb
CHANGED
@@ -53,7 +53,7 @@ module ScrubDb
|
|
53
53
|
def scrub_url_hash(url_hash)
|
54
54
|
url = url_hash[:url_f]
|
55
55
|
path = url_hash[:url_path]
|
56
|
-
|
56
|
+
path = url_hash[:path]
|
57
57
|
url_exts = url_hash[:url_exts]
|
58
58
|
|
59
59
|
url_hash = @filter.scrub_oa(url_hash, url_exts, 'neg_exts', 'equal')
|
data/lib/webs_criteria.rb
CHANGED
@@ -37,11 +37,11 @@ class WebsCriteria
|
|
37
37
|
%w(com net)
|
38
38
|
end
|
39
39
|
|
40
|
-
# def self.
|
40
|
+
# def self.seed_neg_paths
|
41
41
|
# %w(? .com .jpg @ * afri after anounc apply approved blog book business buy call care career cash charit cheap check click collis commerc cont contrib deal distrib download employ event face feature feed financ find fleet form gas generat golf here holiday hospi hour info insta inventory join later light login mail mobile movie museu music news none now oil part pay phone policy priva pump quick quote rate regist review saving schedul service shop sign site speci ticket tire today transla travel truck tv twitter watch youth)
|
42
42
|
# end
|
43
43
|
#
|
44
|
-
# def self.
|
44
|
+
# def self.seed_pos_paths
|
45
45
|
# %w(team staff management)
|
46
46
|
# end
|
47
47
|
|
data/scrub_db.gemspec
CHANGED
@@ -37,11 +37,10 @@ Gem::Specification.new do |spec|
|
|
37
37
|
|
38
38
|
spec.required_ruby_version = '~> 2.5.1'
|
39
39
|
spec.add_dependency 'activesupport', '~> 5.2'
|
40
|
+
spec.add_dependency 'crm_formatter', '~> 2.64'
|
40
41
|
spec.add_dependency 'utf8_sanitizer', '~> 2.16'
|
41
|
-
spec.add_dependency 'crm_formatter', '~> 2.61'
|
42
42
|
|
43
|
-
|
44
|
-
# spec.add_development_dependency 'utf8_sanitizer', '~> 2.15'
|
43
|
+
|
45
44
|
# spec.add_dependency "activesupport-inflector", ['~> 0.1.0']
|
46
45
|
spec.add_development_dependency 'bundler', '~> 1.16', '>= 1.16.2'
|
47
46
|
spec.add_development_dependency 'pry', '~> 0.11.3'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrub_db
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '2.22'
|
4
|
+
version: '2.23'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Booth
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-07-
|
11
|
+
date: 2018-07-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -25,33 +25,33 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '5.2'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name: utf8_sanitizer
|
28
|
+
name: crm_formatter
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '2.16'
|
33
|
+
version: '2.64'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '2.16'
|
40
|
+
version: '2.64'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name: crm_formatter
|
42
|
+
name: utf8_sanitizer
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '2.61'
|
47
|
+
version: '2.16'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '2.61'
|
54
|
+
version: '2.16'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: bundler
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|