scrub_db 0.0.1.pre.rc.02 → 0.0.1.pre.rc.03
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/README.md +229 -7
- data/Rakefile +41 -0
- data/lib/scrub_db/filter.rb +114 -0
- data/lib/scrub_db/version.rb +1 -1
- data/lib/scrub_db/web.rb +108 -0
- data/lib/scrub_db.rb +4 -2
- data/lib/web_criteria.rb +55 -0
- data/scrub_db.gemspec +7 -3
- metadata +25 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 38b27fd85ba16c4f14ca0542699874b3a101f17eb1faf10fadf163cee9e43d20
|
4
|
+
data.tar.gz: 321ca87b878ac6e66c7788da976e28dc49fa14d83e77d41f19aafb6c26c4ad7f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f6c11ca3c7b1575c23d811cdf9e977b1dd2a0a198533260c42e5fd18f3146c1d8b4bdcb73615c094f34a2463c670ec26ee5a4a74ac3fa29ce9f4e0c6949f156
|
7
|
+
data.tar.gz: 9ed5a5cbecd487b94f6a99b65e666baba8dc19938d0b111fdd573790aa530d627b79403759996d4ac2188690b6bab9b37e17834f5e948f33638767c05d7c7704
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,8 +1,5 @@
|
|
1
1
|
# ScrubDb
|
2
|
-
|
3
|
-
Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/scrub_db`. To experiment with that code, run `bin/console` for an interactive prompt.
|
4
|
-
|
5
|
-
TODO: Delete this and the text above, and describe your gem
|
2
|
+
#### Scrub data with your custom criteria. Returns detailed reporting.
|
6
3
|
|
7
4
|
## Installation
|
8
5
|
|
@@ -22,7 +19,232 @@ Or install it yourself as:
|
|
22
19
|
|
23
20
|
## Usage
|
24
21
|
|
25
|
-
|
22
|
+
More methods coming soon. Currently, Scrub Array of URLs is fully functional.
|
23
|
+
|
24
|
+
### 1. Scrub Array of URLs:
|
25
|
+
This is an example of scrubbing auto dealership URLs. We only want URLs that are based in the US and whose paths point to staff pages. Most of our URLs are good, but we want to confirm that they all meet our requirements.
|
26
|
+
|
27
|
+
### A. Pass in Scrub Criteria
|
28
|
+
The first step is to load your web criteria as a hash. You do not need to supply every key shown below, but each key you do use must be a symbol spelled exactly as shown, and each value must be an array of strings.
|
29
|
+
|
30
|
+
```
|
31
|
+
criteria = {
|
32
|
+
neg_urls: %w[approv avis budget collis eat],
|
33
|
+
pos_urls: %w[acura audi bmw bentley],
|
34
|
+
neg_paths: %w[buy bye call cash cheap click collis cont distrib],
|
35
|
+
pos_paths: %w[team staff management],
|
36
|
+
neg_exts: %w[au ca edu es gov in ru uk us],
|
37
|
+
pos_exts: %w[com net]
|
38
|
+
}
|
39
|
+
|
40
|
+
web_obj = ScrubDb::Web.new(criteria)
|
41
|
+
```
|
42
|
+
|
43
|
+
### B. Pass in URLs List
|
44
|
+
Next, pass your list of URLs to `scrub_urls(urls)` with the syntax below.
|
45
|
+
|
46
|
+
```
|
47
|
+
urls = %w[
|
48
|
+
smith_acura.com/staff
|
49
|
+
abcrepair.ca
|
50
|
+
austinchevrolet.not.real
|
51
|
+
hertzrentals.com/review
|
52
|
+
londonhyundai.uk/fleet
|
53
|
+
http://www.townbuick.net/staff
|
54
|
+
http://youtube.com/download
|
55
|
+
www.madridinfiniti.es/collision
|
56
|
+
www.mitsubishideals.sofake
|
57
|
+
www.dallassubaru.com.sofake
|
58
|
+
www.quickeats.net/contact_us
|
59
|
+
www.school.edu/teachers
|
60
|
+
www.www.nissancars/inventory
|
61
|
+
www.www.toyotatown.net/staff/management
|
62
|
+
www.www.yellowpages.com/business
|
63
|
+
]
|
64
|
+
|
65
|
+
scrubbed_web_hashes = web_obj.scrub_urls(urls)
|
66
|
+
```
|
67
|
+
|
68
|
+
### C. Returned Results
|
69
|
+
Notice that the URLs in the list above are NOT uniformly formatted. ScrubDb leverages the `Utf8Sanitizer` and `CrmFormatter` gems to format the URLs first. It then scrubs the formatted URL hashes against the criteria passed in earlier. The results are returned in the format below:
|
70
|
+
|
71
|
+
```
|
72
|
+
scrubbed_web_hashes = [
|
73
|
+
{
|
74
|
+
web_status: 'formatted',
|
75
|
+
url: 'smith_acura.com/staff',
|
76
|
+
url_f: 'http://www.smith_acura.com',
|
77
|
+
url_path: '/staff',
|
78
|
+
web_neg: nil,
|
79
|
+
url_exts: ['com'],
|
80
|
+
neg_exts: [],
|
81
|
+
pos_exts: ['com'],
|
82
|
+
neg_paths: [],
|
83
|
+
pos_paths: ['staff'],
|
84
|
+
neg_urls: [],
|
85
|
+
pos_urls: ['acura']
|
86
|
+
},
|
87
|
+
{
|
88
|
+
web_status: 'formatted',
|
89
|
+
url: 'abcrepair.ca',
|
90
|
+
url_f: 'http://www.abcrepair.ca',
|
91
|
+
url_path: nil,
|
92
|
+
web_neg: nil,
|
93
|
+
url_exts: ['ca'],
|
94
|
+
neg_exts: ['ca'],
|
95
|
+
pos_exts: [],
|
96
|
+
neg_paths: [],
|
97
|
+
pos_paths: [],
|
98
|
+
neg_urls: ['repair'],
|
99
|
+
pos_urls: []
|
100
|
+
},
|
101
|
+
{
|
102
|
+
web_status: 'formatted',
|
103
|
+
url: 'hertzrentals.com/review',
|
104
|
+
url_f: 'http://www.hertzrentals.com',
|
105
|
+
url_path: '/review',
|
106
|
+
web_neg: nil,
|
107
|
+
url_exts: ['com'],
|
108
|
+
neg_exts: [],
|
109
|
+
pos_exts: ['com'],
|
110
|
+
neg_paths: ['review'],
|
111
|
+
pos_paths: [],
|
112
|
+
neg_urls: ['hertz, rent'],
|
113
|
+
pos_urls: []
|
114
|
+
},
|
115
|
+
{
|
116
|
+
web_status: 'formatted',
|
117
|
+
url: 'londonhyundai.uk/fleet',
|
118
|
+
url_f: 'http://www.londonhyundai.uk',
|
119
|
+
url_path: '/fleet',
|
120
|
+
web_neg: nil,
|
121
|
+
url_exts: ['uk'],
|
122
|
+
neg_exts: ['uk'],
|
123
|
+
pos_exts: [],
|
124
|
+
neg_paths: ['fleet'],
|
125
|
+
pos_paths: [],
|
126
|
+
neg_urls: [],
|
127
|
+
pos_urls: ['hyundai']
|
128
|
+
},
|
129
|
+
{
|
130
|
+
web_status: 'formatted',
|
131
|
+
url: 'http://www.townbuick.net/staff',
|
132
|
+
url_f: 'http://www.townbuick.net',
|
133
|
+
url_path: nil,
|
134
|
+
web_neg: nil,
|
135
|
+
url_exts: ['net'],
|
136
|
+
neg_exts: [],
|
137
|
+
pos_exts: ['net'],
|
138
|
+
neg_paths: [],
|
139
|
+
pos_paths: [],
|
140
|
+
neg_urls: [],
|
141
|
+
pos_urls: ['buick']
|
142
|
+
},
|
143
|
+
{
|
144
|
+
web_status: 'formatted',
|
145
|
+
url: 'http://youtube.com/download',
|
146
|
+
url_f: 'http://www.youtube.com',
|
147
|
+
url_path: nil,
|
148
|
+
web_neg: nil,
|
149
|
+
url_exts: ['com'],
|
150
|
+
neg_exts: [],
|
151
|
+
pos_exts: ['com'],
|
152
|
+
neg_paths: [],
|
153
|
+
pos_paths: [],
|
154
|
+
neg_urls: ['youtube'],
|
155
|
+
pos_urls: []
|
156
|
+
},
|
157
|
+
{
|
158
|
+
web_status: 'formatted',
|
159
|
+
url: 'www.madridinfiniti.es/collision',
|
160
|
+
url_f: 'http://www.madridinfiniti.es',
|
161
|
+
url_path: '/collision',
|
162
|
+
web_neg: nil,
|
163
|
+
url_exts: ['es'],
|
164
|
+
neg_exts: ['es'],
|
165
|
+
pos_exts: [],
|
166
|
+
neg_paths: ['collis'],
|
167
|
+
pos_paths: [],
|
168
|
+
neg_urls: [],
|
169
|
+
pos_urls: ['infiniti']
|
170
|
+
},
|
171
|
+
{
|
172
|
+
web_status: 'formatted',
|
173
|
+
url: 'www.dallassubaru.com.sofake',
|
174
|
+
url_f: 'http://www.dallassubaru.com',
|
175
|
+
url_path: nil,
|
176
|
+
web_neg: nil,
|
177
|
+
url_exts: ['com'],
|
178
|
+
neg_exts: [],
|
179
|
+
pos_exts: ['com'],
|
180
|
+
neg_paths: [],
|
181
|
+
pos_paths: [],
|
182
|
+
neg_urls: [],
|
183
|
+
pos_urls: ['subaru']
|
184
|
+
},
|
185
|
+
{
|
186
|
+
web_status: 'formatted',
|
187
|
+
url: 'www.quickeats.net/contact_us',
|
188
|
+
url_f: 'http://www.quickeats.net',
|
189
|
+
url_path: '/contact_us',
|
190
|
+
web_neg: nil,
|
191
|
+
url_exts: ['net'],
|
192
|
+
neg_exts: [],
|
193
|
+
pos_exts: ['net'],
|
194
|
+
neg_paths: ['cont'],
|
195
|
+
pos_paths: [],
|
196
|
+
neg_urls: ['eat, quick'],
|
197
|
+
pos_urls: []
|
198
|
+
},
|
199
|
+
{
|
200
|
+
web_status: 'formatted',
|
201
|
+
url: 'www.school.edu/teachers',
|
202
|
+
url_f: 'http://www.school.edu',
|
203
|
+
url_path: '/teachers',
|
204
|
+
web_neg: nil,
|
205
|
+
url_exts: ['edu'],
|
206
|
+
neg_exts: ['edu'],
|
207
|
+
pos_exts: [],
|
208
|
+
neg_paths: [],
|
209
|
+
pos_paths: [],
|
210
|
+
neg_urls: [],
|
211
|
+
pos_urls: []
|
212
|
+
},
|
213
|
+
{
|
214
|
+
web_status: 'formatted',
|
215
|
+
url: 'www.www.toyotatown.net/staff/management',
|
216
|
+
url_f: 'http://www.toyotatown.net',
|
217
|
+
url_path: '/staff/management',
|
218
|
+
web_neg: nil,
|
219
|
+
url_exts: ['net'],
|
220
|
+
neg_exts: [],
|
221
|
+
pos_exts: ['net'],
|
222
|
+
neg_paths: [],
|
223
|
+
pos_paths: ['staff, management'],
|
224
|
+
neg_urls: [],
|
225
|
+
pos_urls: ['toyota']
|
226
|
+
},
|
227
|
+
{
|
228
|
+
web_status: 'formatted',
|
229
|
+
url: 'www.www.yellowpages.com/business',
|
230
|
+
url_f: 'http://www.yellowpages.com',
|
231
|
+
url_path: '/business',
|
232
|
+
web_neg: nil,
|
233
|
+
url_exts: ['com'],
|
234
|
+
neg_exts: [],
|
235
|
+
pos_exts: ['com'],
|
236
|
+
neg_paths: ['business'],
|
237
|
+
pos_paths: [],
|
238
|
+
neg_urls: ['yellowpages'],
|
239
|
+
pos_urls: []
|
240
|
+
}
|
241
|
+
]
|
242
|
+
```
|
243
|
+
|
244
|
+
|
245
|
+
## Author
|
246
|
+
|
247
|
+
Adam J Booth - [4rlm](https://github.com/4rlm)
|
26
248
|
|
27
249
|
## Development
|
28
250
|
|
@@ -32,7 +254,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
32
254
|
|
33
255
|
## Contributing
|
34
256
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
257
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/4rlm/scrub_db. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
36
258
|
|
37
259
|
## License
|
38
260
|
|
@@ -40,4 +262,4 @@ The gem is available as open source under the terms of the [MIT License](https:/
|
|
40
262
|
|
41
263
|
## Code of Conduct
|
42
264
|
|
43
|
-
Everyone interacting in the ScrubDb project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/
|
265
|
+
Everyone interacting in the ScrubDb project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/4rlm/scrub_db/blob/master/CODE_OF_CONDUCT.md).
|
data/Rakefile
CHANGED
@@ -1,6 +1,47 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
2
|
require "rspec/core/rake_task"
|
3
|
+
require 'scrub_db'
|
4
|
+
require 'web_criteria'
|
5
|
+
|
3
6
|
|
4
7
|
RSpec::Core::RakeTask.new(:spec)
|
5
8
|
|
6
9
|
task :default => :spec
|
10
|
+
task :test => :spec
|
11
|
+
|
12
|
+
###################
|
13
|
+
# Developer console: loads the gem, scrubs the sample URLs, pauses in pry so
# the results can be inspected, then hands over to IRB.
task :console do
  %w[irb irb/completion scrub_db active_support/all].each { |lib| require lib }

  # Presumably cleared so IRB does not treat rake's own arguments as its own.
  ARGV.clear

  # Kept in a local so it is visible from the pry session on the next line.
  scrubbed_urls = scrub_sample_urls
  # NOTE(review): intentional breakpoint; `scrubbed_urls` is only in scope here,
  # not in the IRB session started afterwards.
  binding.pry

  IRB.start
end
|
25
|
+
|
26
|
+
# Scrubs a fixed list of deliberately messy sample URLs against the bundled
# WebCriteria seed criteria and returns whatever ScrubDb::Web#scrub_urls yields.
def scrub_sample_urls
  ScrubDb::Web.new(WebCriteria.all_web_criteria).scrub_urls(
    %w[
      smith_acura.com/staff
      abcrepair.ca
      austinchevrolet.not.real
      hertzrentals.com/review
      londonhyundai.uk/fleet
      http://www.townbuick.net/staff
      http://youtube.com/download
      www.madridinfiniti.es/collision
      www.mitsubishideals.sofake
      www.dallassubaru.com.sofake
      www.quickeats.net/contact_us
      www.school.edu/teachers
      www.www.nissancars/inventory
      www.www.toyotatown.net/staff/management
      www.www.yellowpages.com/business
    ]
  )
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# frozen_string_literal: false
|
2
|
+
|
3
|
+
module ScrubDb
|
4
|
+
class Filter
|
5
|
+
|
6
|
+
# Stores the scrub criteria that #scrub_oa matches against.
#
# @param args [Hash, nil] criteria lists keyed by symbol (e.g. :neg_exts);
#   nil is treated the same as an empty hash so callers that forward an
#   optional criteria argument do not crash here.
def initialize(args={})
  @args = args || {}
  # Cached so #scrub_oa can bail out early when there is nothing to match.
  @empty_criteria = @args.empty?
end
|
11
|
+
|
12
|
+
# Appends to hash[oa_name] a comma-separated list of the criteria that match
# +target+.
#
# @param hash [Hash] result hash to annotate; returned in every case
# @param target [String, Array<String>] value(s) to test; a String is split
#   on ', ' into several targets
# @param oa_name [String] name of the criteria list (looked up as a symbol in
#   the criteria given to the constructor) and of the key written in +hash+
# @param include_or_equal [String] 'include' for substring matching, 'equal'
#   for exact matching; any other value matches nothing
# @return [Hash] +hash+, with one string of unique, sorted matches appended
#   under oa_name when at least one criterion matched
def scrub_oa(hash, target, oa_name, include_or_equal)
  return hash unless oa_name.present? && !@empty_criteria && target.present?

  key = oa_name.to_sym
  criteria = Array(@args[key])
  return hash unless criteria.any?

  targets = target.is_a?(::String) ? target.split(', ') : target

  # Collect matches as a flat list so targets that match nothing contribute
  # nothing (joining per target first left blank entries such as ", uk").
  matches = targets.flat_map do |tar|
    case include_or_equal
    when 'include' then criteria.select { |crit| tar.include?(crit) }
    when 'equal' then criteria.select { |crit| tar == crit }
    else []
    end
  end

  scrub_match = matches.uniq.sort.join(', ')
  return hash if scrub_match.empty?

  # Tolerate hashes that were not pre-seeded with an array for this key.
  (hash[key] ||= []) << scrub_match
  hash
end
|
44
|
+
######################################
|
45
|
+
|
46
|
+
|
47
|
+
# def grab_global_hash
|
48
|
+
# keys = %i[row_id act_name street city state zip full_addr phone url street_f city_f state_f zip_f full_addr_f phone_f url_f url_path web_neg address_status phone_status web_status utf_status]
|
49
|
+
# @global_hash = Hash[keys.map { |a| [a, nil] }]
|
50
|
+
# end
|
51
|
+
|
52
|
+
# def update_global_hash(local_keys)
|
53
|
+
# gkeys = @global_hash.keys
|
54
|
+
# lkeys = local_keys.uniq.sort
|
55
|
+
# # lkeys = lkeys.map(&:to_sym)
|
56
|
+
# # gkeys = gkeys.map(&:to_sym)
|
57
|
+
# add_to_global = lkeys - gkeys
|
58
|
+
# same_keys = lkeys && gkeys
|
59
|
+
# add_to_global += same_keys - gkeys
|
60
|
+
# add_to_global&.uniq!
|
61
|
+
#
|
62
|
+
# if add_to_global.any?
|
63
|
+
# add_to_global += gkeys
|
64
|
+
# row = add_to_global.map { |_| nil }
|
65
|
+
# @global_hash = row_to_hsh(global_keys, row)
|
66
|
+
# end
|
67
|
+
# end
|
68
|
+
|
69
|
+
# def row_to_hsh(headers, row)
|
70
|
+
# headers = headers.map(&:to_sym)
|
71
|
+
# hash = Hash[headers.zip(row)]
|
72
|
+
# end
|
73
|
+
|
74
|
+
|
75
|
+
# def letter_case_check(str)
|
76
|
+
# return unless str.present?
|
77
|
+
# flashes = str&.gsub(/[^ A-Za-z]/, '')&.strip&.split(' ')
|
78
|
+
# flash = flashes&.reject { |e| e.length < 3 }&.join(' ')
|
79
|
+
#
|
80
|
+
# return str unless flash.present?
|
81
|
+
# has_caps = flash.scan(/[A-Z]/).any?
|
82
|
+
# has_lows = flash.scan(/[a-z]/).any?
|
83
|
+
#
|
84
|
+
# return str unless !has_caps || !has_lows
|
85
|
+
# str = str.split(' ')&.each { |el| el.capitalize! if el.gsub(/[^ A-Za-z]/, '')&.strip&.length > 2 }&.join(' ')
|
86
|
+
# end
|
87
|
+
|
88
|
+
### Save for later. ###
|
89
|
+
|
90
|
+
### These two methods can set instance vars from args.
|
91
|
+
# def set(name, value)
|
92
|
+
# var_name = "@#{name}" # the '@' is required
|
93
|
+
# self.instance_variable_set(var_name, value)
|
94
|
+
# end
|
95
|
+
|
96
|
+
# def set_args(args, inst_vars)
|
97
|
+
# return unless args.any?
|
98
|
+
# args.symbolize_keys!
|
99
|
+
# keys_to_slice = (args.keys.uniq) & inst_vars
|
100
|
+
# args.slice!(*keys_to_slice)
|
101
|
+
# args
|
102
|
+
# end
|
103
|
+
|
104
|
+
# def compare_diff(hsh)
|
105
|
+
# res = []
|
106
|
+
# hsh.to_a.reduce do |el, nxt|
|
107
|
+
# res << nxt.first if el.last != nxt.last
|
108
|
+
# el = nxt
|
109
|
+
# end
|
110
|
+
# res.compact!
|
111
|
+
# end
|
112
|
+
|
113
|
+
end
|
114
|
+
end
|
data/lib/scrub_db/version.rb
CHANGED
data/lib/scrub_db/web.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module ScrubDb
|
4
|
+
class Web
|
5
|
+
# attr_accessor :headers, :valid_rows, :encoded_rows, :row_id, :data_hash, :defective_rows, :error_rows
|
6
|
+
|
7
|
+
# Builds the filter used by #scrub_urls.
#
# @param criteria [Hash, nil] scrub criteria forwarded to ScrubDb::Filter;
#   nil is treated as an empty hash
def initialize(criteria={})
  criteria ||= {}
  @empty_criteria = criteria.empty?
  # Always build the filter: with empty criteria it simply returns each hash
  # untouched, whereas a nil @filter made every later scrub call raise.
  @filter = ScrubDb::Filter.new(criteria)
end
|
11
|
+
|
12
|
+
# Formats the given URLs with CrmFormatter, seeds each resulting hash with
# empty criteria-result arrays, and scrubs every usable entry.
#
# An entry is scrubbed only when its :web_status is not 'invalid' and its
# :url_f is present. Other entries are returned unscrubbed (still carrying the
# empty result arrays) instead of being replaced by nil, so the result keeps
# one hash per formatted URL.
#
# @param urls [Array<String>] raw URLs
# @return [Array<Hash>] one hash per entry returned by CrmFormatter.format_urls
def scrub_urls(urls=[])
  formatted_url_hashes = merge_criteria_hashes(CrmFormatter.format_urls(urls))

  formatted_url_hashes.map! do |url_hash|
    next url_hash unless url_hash[:web_status] != 'invalid' && url_hash[:url_f].present?

    url_hash[:url_exts] = extract_exts(url_hash)
    scrub_url_hash(url_hash)
  end
end
|
23
|
+
|
24
|
+
# Replaces every element of +hashes+ in place with the result of
# #merge_criteria_hash and returns the same array.
def merge_criteria_hashes(hashes)
  hashes.each_with_index do |entry, position|
    hashes[position] = merge_criteria_hash(entry)
  end
  hashes
end
|
29
|
+
|
30
|
+
# Resets every criteria-result key of +url_hash+ to a fresh empty array,
# mutating the hash in place, and returns it.
def merge_criteria_hash(url_hash)
  %i[url_exts neg_exts pos_exts neg_paths pos_paths neg_urls pos_urls].each do |result_key|
    url_hash[result_key] = []
  end
  url_hash
end
|
43
|
+
|
44
|
+
# Returns the host labels of url_hash[:url_f] after the first two, e.g.
# ['co', 'uk'] for 'http://www.example.co.uk'.
#
# NOTE(review): dropping two labels assumes :url_f always has the form
# 'www.<name>.<ext...>' — confirm against CrmFormatter's output.
#
# @param url_hash [Hash] hash whose :url_f holds the formatted URL
# @return [Array<String>] extension labels; empty when the URI has no host or
#   the host has fewer than three labels
def extract_exts(url_hash)
  host = URI(url_hash[:url_f]).host
  return [] unless host

  host.split('.').drop(2)
end
|
48
|
+
|
49
|
+
# Runs every criteria check against one formatted URL hash.
#
# Extensions are compared for equality; the formatted URL and the path are
# checked for substring matches. Each check is delegated to the filter's
# scrub_oa, which returns the hash passed to the next check.
#
# @param url_hash [Hash] hash carrying :url_f, :url_path and :url_exts
# @return [Hash] the hash returned by the last check, or +url_hash+ untouched
#   when no filter is configured
def scrub_url_hash(url_hash)
  return url_hash if @filter.nil?

  url = url_hash[:url_f]
  path = url_hash[:url_path]
  exts = url_hash[:url_exts]

  checks = [
    [exts, 'neg_exts', 'equal'],
    [exts, 'pos_exts', 'equal'],
    [url, 'neg_urls', 'include'],
    [url, 'pos_urls', 'include'],
    [path, 'neg_paths', 'include'],
    [path, 'pos_paths', 'include']
  ]

  checks.reduce(url_hash) do |hash, (target, oa_name, mode)|
    @filter.scrub_oa(hash, target, oa_name, mode)
  end
end
|
63
|
+
|
64
|
+
# def remove_invalid_links(link)
|
65
|
+
# link_hsh = { link: link, valid_link: nil, flags: nil }
|
66
|
+
# return link_hsh unless link.present?
|
67
|
+
# @neg_paths += get_symbs
|
68
|
+
# flags = @neg_paths.select { |red| link&.include?(red) }
|
69
|
+
# flags << "below #{2}" if link.length < 2
|
70
|
+
# flags << "over #{100}" if link.length > 100
|
71
|
+
# flags = flags.flatten.compact
|
72
|
+
# valid_link = flags.any? ? nil : link
|
73
|
+
# link_hsh[:valid_link] = valid_link
|
74
|
+
# link_hsh[:flags] = flags.join(', ')
|
75
|
+
# binding.pry
|
76
|
+
# link_hsh
|
77
|
+
# end
|
78
|
+
|
79
|
+
# def remove_invalid_hrefs(href)
|
80
|
+
# href_hsh = { href: href, valid_href: nil, flags: nil }
|
81
|
+
# return href_hsh unless href.present?
|
82
|
+
# @neg_hrefs += get_symbs
|
83
|
+
# href = href.split('|').join(' ')
|
84
|
+
# href = href.split('/').join(' ')
|
85
|
+
# href&.gsub!('(', ' ')
|
86
|
+
# href&.gsub!(')', ' ')
|
87
|
+
# href&.gsub!('[', ' ')
|
88
|
+
# href&.gsub!(']', ' ')
|
89
|
+
# href&.gsub!(',', ' ')
|
90
|
+
# href&.gsub!("'", ' ')
|
91
|
+
#
|
92
|
+
# flags = []
|
93
|
+
# flags << "over #{100}" if href.length > 100
|
94
|
+
# invalid_text = Regexp.new(/[0-9]/)
|
95
|
+
# flags << invalid_text&.match(href)
|
96
|
+
# href = href&.downcase
|
97
|
+
# href = href&.strip
|
98
|
+
#
|
99
|
+
# flags << @neg_hrefs.select { |red| href&.include?(red) }
|
100
|
+
# flags = flags.flatten.compact.uniq
|
101
|
+
# href_hsh[:valid_href] = href unless flags.any?
|
102
|
+
# href_hsh[:flags] = flags.join(', ')
|
103
|
+
# href_hsh
|
104
|
+
# end
|
105
|
+
|
106
|
+
end
|
107
|
+
|
108
|
+
end
|
data/lib/scrub_db.rb
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
require "scrub_db/version"
|
2
|
-
|
2
|
+
require 'scrub_db/web'
|
3
|
+
require 'scrub_db/filter'
|
3
4
|
require 'pry'
|
5
|
+
require 'crm_formatter'
|
4
6
|
|
5
7
|
# Top-level namespace of the scrub_db gem.
module ScrubDb
  class << self
    # Prints a one-line greeting to standard output.
    def welcome
      puts "Welcome to the gem!"
    end
  end
end
|
data/lib/web_criteria.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# Usage: WebCriteria.all_web_criteria
|
2
|
+
|
3
|
+
class WebCriteria
|
4
|
+
|
5
|
+
def self.all_web_criteria
|
6
|
+
{
|
7
|
+
neg_urls: seed_neg_urls,
|
8
|
+
pos_urls: seed_pos_urls,
|
9
|
+
neg_paths: seed_neg_paths,
|
10
|
+
pos_paths: seed_pos_paths,
|
11
|
+
neg_exts: seed_neg_exts,
|
12
|
+
pos_exts: seed_pos_exts
|
13
|
+
}
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.seed_neg_urls
|
17
|
+
%w(approv avis budget collis eat enterprise facebook financ food google gourmet hertz hotel hyatt insur invest loan lube mobility motel motorola parts quick rent repair restaur rv ryder service softwar travel twitter webhost yellowpages yelp youtube)
|
18
|
+
end
|
19
|
+
|
20
|
+
def self.seed_pos_urls
|
21
|
+
["acura", "alfa romeo", "aston martin", "audi", "bmw", "bentley", "bugatti", "buick", "cdjr", "cadillac", "chevrolet", "chrysler", "dodge", "ferrari", "fiat", "ford", "gmc", "group", "group", "honda", "hummer", "hyundai", "infiniti", "isuzu", "jaguar", "jeep", "kia", "lamborghini", "lexus", "lincoln", "lotus", "mini", "maserati", "mazda", "mclaren", "mercedes-benz", "mitsubishi", "nissan", "porsche", "ram", "rolls-royce", "saab", "scion", "smart", "subaru", "suzuki", "toyota", "volkswagen", "volvo"]
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.seed_neg_paths
|
25
|
+
%w(: .biz .co .edu .gov .jpg .net // afri anounc book business buy bye call cash cheap click collis cont distrib download drop event face feature feed financ find fleet form gas generat graphic hello home hospi hour hours http info insta inventory item join login mail mailto mobile movie museu music news none offer part phone policy priva pump rate regist review schedul school service shop site test ticket tire tv twitter watch www yelp youth)
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.seed_pos_paths
|
29
|
+
%w(team staff management)
|
30
|
+
end
|
31
|
+
|
32
|
+
def self.seed_neg_exts
|
33
|
+
%w(au ca edu es gov in ru uk us)
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.seed_pos_exts
|
37
|
+
%w(com net)
|
38
|
+
end
|
39
|
+
|
40
|
+
# def self.seed_neg_hrefs
|
41
|
+
# %w(? .com .jpg @ * afri after anounc apply approved blog book business buy call care career cash charit cheap check click collis commerc cont contrib deal distrib download employ event face feature feed financ find fleet form gas generat golf here holiday hospi hour info insta inventory join later light login mail mobile movie museu music news none now oil part pay phone policy priva pump quick quote rate regist review saving schedul service shop sign site speci ticket tire today transla travel truck tv twitter watch youth)
|
42
|
+
# end
|
43
|
+
#
|
44
|
+
# def self.seed_pos_hrefs
|
45
|
+
# %w(team staff management)
|
46
|
+
# end
|
47
|
+
|
48
|
+
|
49
|
+
# ##Rails C: StartCrm.run_webs
|
50
|
+
# def self.get_urls
|
51
|
+
# urls = %w(approvedautosales.org autosmartfinance.com leessummitautorepair.net melodytoyota.com northeastacura.com gemmazda.com)
|
52
|
+
# urls += %w(website.com website.business.site website website.fake website.fake.com website.com.fake)
|
53
|
+
# end
|
54
|
+
|
55
|
+
end
|
data/scrub_db.gemspec
CHANGED
@@ -12,8 +12,8 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.homepage = 'https://github.com/4rlm/scrub_db'
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
15
|
-
spec.summary = %q{
|
16
|
-
spec.description = %q{
|
15
|
+
spec.summary = %q{Scrub data with your custom criteria. Returns detailed reporting.}
|
16
|
+
spec.description = %q{Scrub data with your custom criteria. Returns detailed reporting. Rspecs coming soon.}
|
17
17
|
|
18
18
|
if spec.respond_to?(:metadata)
|
19
19
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
@@ -39,7 +39,11 @@ Gem::Specification.new do |spec|
|
|
39
39
|
|
40
40
|
spec.required_ruby_version = '~> 2.5.1'
|
41
41
|
spec.add_dependency 'activesupport', '~> 5.2', '>= 5.2.0'
|
42
|
-
spec.add_dependency "activesupport-inflector", ['~> 0.1.0']
|
42
|
+
# spec.add_dependency "activesupport-inflector", ['~> 0.1.0']
|
43
|
+
|
44
|
+
spec.add_dependency "utf8_sanitizer", "~> 2.0"
|
45
|
+
spec.add_dependency "crm_formatter", "~> 2.4"
|
46
|
+
|
43
47
|
spec.add_development_dependency 'bundler', '~> 1.16', '>= 1.16.2'
|
44
48
|
spec.add_development_dependency 'byebug', '~> 10.0', '>= 10.0.2'
|
45
49
|
spec.add_development_dependency 'class_indexer', '~> 0.3.0'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrub_db
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1.pre.rc.
|
4
|
+
version: 0.0.1.pre.rc.03
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Booth
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-06-
|
11
|
+
date: 2018-06-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -31,19 +31,33 @@ dependencies:
|
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 5.2.0
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
|
-
name:
|
34
|
+
name: utf8_sanitizer
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
37
|
- - "~>"
|
38
38
|
- !ruby/object:Gem::Version
|
39
|
-
version:
|
39
|
+
version: '2.0'
|
40
40
|
type: :runtime
|
41
41
|
prerelease: false
|
42
42
|
version_requirements: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version:
|
46
|
+
version: '2.0'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: crm_formatter
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '2.4'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '2.4'
|
47
61
|
- !ruby/object:Gem::Dependency
|
48
62
|
name: bundler
|
49
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -194,7 +208,8 @@ dependencies:
|
|
194
208
|
- - "~>"
|
195
209
|
- !ruby/object:Gem::Version
|
196
210
|
version: 0.97.4
|
197
|
-
description:
|
211
|
+
description: Scrub data with your custom criteria. Returns detailed reporting. Rspecs
|
212
|
+
coming soon.
|
198
213
|
email:
|
199
214
|
- 4rlm@protonmail.ch
|
200
215
|
executables: []
|
@@ -212,7 +227,10 @@ files:
|
|
212
227
|
- bin/console
|
213
228
|
- bin/setup
|
214
229
|
- lib/scrub_db.rb
|
230
|
+
- lib/scrub_db/filter.rb
|
215
231
|
- lib/scrub_db/version.rb
|
232
|
+
- lib/scrub_db/web.rb
|
233
|
+
- lib/web_criteria.rb
|
216
234
|
- scrub_db.gemspec
|
217
235
|
homepage: https://github.com/4rlm/scrub_db
|
218
236
|
licenses:
|
@@ -238,5 +256,5 @@ rubyforge_project:
|
|
238
256
|
rubygems_version: 2.7.6
|
239
257
|
signing_key:
|
240
258
|
specification_version: 4
|
241
|
-
summary:
|
259
|
+
summary: Scrub data with your custom criteria. Returns detailed reporting.
|
242
260
|
test_files: []
|