scrub_db 0.0.1.pre.rc.03 → 2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec_status +4 -0
- data/README.md +415 -31
- data/Rakefile +71 -7
- data/junk.rb +114 -0
- data/lib/scrub_db/filter.rb +40 -21
- data/lib/scrub_db/strings.rb +52 -0
- data/lib/scrub_db/version.rb +1 -1
- data/lib/scrub_db/webs.rb +70 -0
- data/lib/scrub_db.rb +2 -1
- data/lib/{web_criteria.rb → webs_criteria.rb} +5 -5
- data/scrub_db.gemspec +3 -3
- metadata +20 -11
- data/lib/scrub_db/web.rb +0 -108
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e6140658a28c9843df6f94a6c5948df622edffeaa905bc37232b229e4ac30b62
|
4
|
+
data.tar.gz: 56858739677c75f755583d51e69db49634822671a4e0c2eb346ea537682674c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4a606e4be1bc35a3530ede12fa66af81970628c3fe7504f50045bac27a4933b1248e06e9f18362fe573d2b8b28cfd10a2e3b797e6a1795ed1f92711506ca4057
|
7
|
+
data.tar.gz: e6b10af82247ebebd24357a9c04393a1e16fbd9708cfba703cb5c764138a336fa8f63824b5d2e3f8c3337eeb9c7083739593006de5a2c7510c4ef41ed6b8902b
|
data/.rspec_status
ADDED
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# ScrubDb
|
2
|
-
#### Scrub data
|
2
|
+
#### Scrub your database, API data, web scraping data, and web form submissions based on your custom criteria. Allows for different criteria for different jobs. Returns detailed reporting to zero in on your data with ease, efficiency, and greater insight. Optionally pre-formats data before scrubbing to also normalize and standardize your data sets, e.g. uniform URL patterns.
|
3
3
|
|
4
4
|
## Installation
|
5
5
|
|
@@ -19,17 +19,65 @@ Or install it yourself as:
|
|
19
19
|
|
20
20
|
## Usage
|
21
21
|
|
22
|
-
|
22
|
+
### I. Usage Overview
|
23
|
+
|
24
|
+
|
25
|
+
#### Step 1: Load Your Scrub Criteria:
|
26
|
+
|
27
|
+
##### 1. For String Criteria
|
28
|
+
```
|
29
|
+
strings_criteria = {
|
30
|
+
pos_criteria: %w[your positive criteria here],
|
31
|
+
neg_criteria: %w[your negative criteria here]
|
32
|
+
}
|
33
|
+
strings_obj = ScrubDb::Strings.new(strings_criteria)
|
34
|
+
```
|
35
|
+
|
36
|
+
##### 2. For Web Criteria
|
37
|
+
```
|
38
|
+
webs_criteria = {
|
39
|
+
pos_criteria: %w[your positive criteria here],
|
40
|
+
neg_criteria: %w[your negative criteria here]
|
41
|
+
}
|
42
|
+
webs_obj = ScrubDb::Webs.new(webs_criteria)
|
43
|
+
```
|
44
|
+
|
45
|
+
|
46
|
+
|
47
|
+
#### Step 2: Load Your Data to Scrub:
|
48
|
+
|
49
|
+
##### Methods available to scrub data:
|
50
|
+
|
51
|
+
##### 1. Scrub URLs:
|
52
|
+
```
|
53
|
+
scrub_web_obj = ScrubDb::Webs.new(criteria)
|
54
|
+
scrubbed_web_hashes = scrub_web_obj.scrub_urls(urls)
|
55
|
+
```
|
56
|
+
|
57
|
+
##### 2. Scrub Strings:
|
58
|
+
```
|
59
|
+
strings_obj = ScrubDb::Strings.new(strings_criteria)
|
60
|
+
scrubbed_strings = strings_obj.scrub_strings(array_of_strings)
|
61
|
+
```
|
62
|
+
|
63
|
+
##### 3. Scrub Proper Strings:
|
64
|
+
```
|
65
|
+
strings_obj = ScrubDb::Strings.new(strings_criteria)
|
66
|
+
scrubbed_prop_strings = strings_obj.scrub_proper_strings(array_of_props)
|
67
|
+
```
|
68
|
+
|
69
|
+
|
70
|
+
### II. Usage Details
|
23
71
|
|
24
72
|
### 1. Scrub Array of URLs:
|
25
73
|
This is an example of scrubbing auto dealership URLs. We only want URLs based in the US, and only paths that point to staff pages. Most of our URLs are good, but we want to confirm that they all meet our requirements.
|
26
74
|
|
27
75
|
### A. Pass in Scrub Criteria
|
28
|
-
First step is to load your
|
76
|
+
First step is to load your Webs criteria in hash format. It's not required to enter all the keys below, but for those you are using, each key must be a symbol and be exactly the same as the ones below. The values must each be an array of strings.
|
29
77
|
|
30
78
|
```
|
31
79
|
criteria = {
|
32
|
-
neg_urls: %w[
|
80
|
+
neg_urls: %w[aprov avis budget collis eat],
|
33
81
|
pos_urls: %w[acura audi bmw bentley],
|
34
82
|
neg_paths: %w[buy bye call cash cheap click collis cont distrib],
|
35
83
|
pos_paths: %w[team staff management],
|
@@ -37,17 +85,18 @@ criteria = {
|
|
37
85
|
pos_exts: %w[com net]
|
38
86
|
}
|
39
87
|
|
40
|
-
|
88
|
+
scrub_web_obj = ScrubDb::Webs.new(criteria)
|
41
89
|
```
|
42
90
|
|
43
91
|
### B. Pass in URLs List
|
92
|
+
|
44
93
|
Next, pass your list of URLs to `scrub_urls(urls)` with the syntax below.
|
45
94
|
|
46
95
|
```
|
47
96
|
urls = %w[
|
97
|
+
austinchevrolet.not.real
|
48
98
|
smith_acura.com/staff
|
49
99
|
abcrepair.ca
|
50
|
-
austinchevrolet.not.real
|
51
100
|
hertzrentals.com/review
|
52
101
|
londonhyundai.uk/fleet
|
53
102
|
http://www.townbuick.net/staff
|
@@ -62,7 +111,7 @@ urls = %w[
|
|
62
111
|
www.www.yellowpages.com/business
|
63
112
|
]
|
64
113
|
|
65
|
-
scrubbed_web_hashes =
|
114
|
+
scrubbed_web_hashes = scrub_web_obj.scrub_urls(urls)
|
66
115
|
```
|
67
116
|
|
68
117
|
### C. Returned Results
|
@@ -71,7 +120,20 @@ Notice that the URLs in the list above are NOT uniformly formatted. ScrubDb lev
|
|
71
120
|
```
|
72
121
|
scrubbed_web_hashes = [
|
73
122
|
{
|
74
|
-
web_status: '
|
123
|
+
web_status: 'invalid',
|
124
|
+
url: 'austinchevrolet.not.real',
|
125
|
+
url_f: nil,
|
126
|
+
url_path: nil,
|
127
|
+
web_neg: 'error: ext.invalid [not, real]',
|
128
|
+
url_exts: [],
|
129
|
+
neg_exts: [],
|
130
|
+
pos_exts: [],
|
131
|
+
neg_paths: [],
|
132
|
+
pos_paths: [],
|
133
|
+
neg_urls: [],
|
134
|
+
pos_urls: []
|
135
|
+
},
|
136
|
+
{ web_status: 'formatted',
|
75
137
|
url: 'smith_acura.com/staff',
|
76
138
|
url_f: 'http://www.smith_acura.com',
|
77
139
|
url_path: '/staff',
|
@@ -84,8 +146,7 @@ scrubbed_web_hashes = [
|
|
84
146
|
neg_urls: [],
|
85
147
|
pos_urls: ['acura']
|
86
148
|
},
|
87
|
-
{
|
88
|
-
web_status: 'formatted',
|
149
|
+
{ web_status: 'formatted',
|
89
150
|
url: 'abcrepair.ca',
|
90
151
|
url_f: 'http://www.abcrepair.ca',
|
91
152
|
url_path: nil,
|
@@ -98,8 +159,7 @@ scrubbed_web_hashes = [
|
|
98
159
|
neg_urls: ['repair'],
|
99
160
|
pos_urls: []
|
100
161
|
},
|
101
|
-
{
|
102
|
-
web_status: 'formatted',
|
162
|
+
{ web_status: 'formatted',
|
103
163
|
url: 'hertzrentals.com/review',
|
104
164
|
url_f: 'http://www.hertzrentals.com',
|
105
165
|
url_path: '/review',
|
@@ -112,8 +172,7 @@ scrubbed_web_hashes = [
|
|
112
172
|
neg_urls: ['hertz, rent'],
|
113
173
|
pos_urls: []
|
114
174
|
},
|
115
|
-
{
|
116
|
-
web_status: 'formatted',
|
175
|
+
{ web_status: 'formatted',
|
117
176
|
url: 'londonhyundai.uk/fleet',
|
118
177
|
url_f: 'http://www.londonhyundai.uk',
|
119
178
|
url_path: '/fleet',
|
@@ -126,8 +185,7 @@ scrubbed_web_hashes = [
|
|
126
185
|
neg_urls: [],
|
127
186
|
pos_urls: ['hyundai']
|
128
187
|
},
|
129
|
-
{
|
130
|
-
web_status: 'formatted',
|
188
|
+
{ web_status: 'formatted',
|
131
189
|
url: 'http://www.townbuick.net/staff',
|
132
190
|
url_f: 'http://www.townbuick.net',
|
133
191
|
url_path: nil,
|
@@ -140,8 +198,7 @@ scrubbed_web_hashes = [
|
|
140
198
|
neg_urls: [],
|
141
199
|
pos_urls: ['buick']
|
142
200
|
},
|
143
|
-
{
|
144
|
-
web_status: 'formatted',
|
201
|
+
{ web_status: 'formatted',
|
145
202
|
url: 'http://youtube.com/download',
|
146
203
|
url_f: 'http://www.youtube.com',
|
147
204
|
url_path: nil,
|
@@ -154,8 +211,7 @@ scrubbed_web_hashes = [
|
|
154
211
|
neg_urls: ['youtube'],
|
155
212
|
pos_urls: []
|
156
213
|
},
|
157
|
-
{
|
158
|
-
web_status: 'formatted',
|
214
|
+
{ web_status: 'formatted',
|
159
215
|
url: 'www.madridinfiniti.es/collision',
|
160
216
|
url_f: 'http://www.madridinfiniti.es',
|
161
217
|
url_path: '/collision',
|
@@ -168,8 +224,20 @@ scrubbed_web_hashes = [
|
|
168
224
|
neg_urls: [],
|
169
225
|
pos_urls: ['infiniti']
|
170
226
|
},
|
171
|
-
{
|
172
|
-
|
227
|
+
{ web_status: 'invalid',
|
228
|
+
url: 'www.mitsubishideals.sofake',
|
229
|
+
url_f: nil,
|
230
|
+
url_path: nil,
|
231
|
+
web_neg: 'error: ext.invalid [sofake]',
|
232
|
+
url_exts: [],
|
233
|
+
neg_exts: [],
|
234
|
+
pos_exts: [],
|
235
|
+
neg_paths: [],
|
236
|
+
pos_paths: [],
|
237
|
+
neg_urls: [],
|
238
|
+
pos_urls: []
|
239
|
+
},
|
240
|
+
{ web_status: 'formatted',
|
173
241
|
url: 'www.dallassubaru.com.sofake',
|
174
242
|
url_f: 'http://www.dallassubaru.com',
|
175
243
|
url_path: nil,
|
@@ -182,8 +250,7 @@ scrubbed_web_hashes = [
|
|
182
250
|
neg_urls: [],
|
183
251
|
pos_urls: ['subaru']
|
184
252
|
},
|
185
|
-
{
|
186
|
-
web_status: 'formatted',
|
253
|
+
{ web_status: 'formatted',
|
187
254
|
url: 'www.quickeats.net/contact_us',
|
188
255
|
url_f: 'http://www.quickeats.net',
|
189
256
|
url_path: '/contact_us',
|
@@ -196,8 +263,7 @@ scrubbed_web_hashes = [
|
|
196
263
|
neg_urls: ['eat, quick'],
|
197
264
|
pos_urls: []
|
198
265
|
},
|
199
|
-
{
|
200
|
-
web_status: 'formatted',
|
266
|
+
{ web_status: 'formatted',
|
201
267
|
url: 'www.school.edu/teachers',
|
202
268
|
url_f: 'http://www.school.edu',
|
203
269
|
url_path: '/teachers',
|
@@ -210,8 +276,20 @@ scrubbed_web_hashes = [
|
|
210
276
|
neg_urls: [],
|
211
277
|
pos_urls: []
|
212
278
|
},
|
213
|
-
{
|
214
|
-
|
279
|
+
{ web_status: 'invalid',
|
280
|
+
url: 'www.www.nissancars/inventory',
|
281
|
+
url_f: nil,
|
282
|
+
url_path: nil,
|
283
|
+
web_neg: 'error: ext.none',
|
284
|
+
url_exts: [],
|
285
|
+
neg_exts: [],
|
286
|
+
pos_exts: [],
|
287
|
+
neg_paths: [],
|
288
|
+
pos_paths: [],
|
289
|
+
neg_urls: [],
|
290
|
+
pos_urls: []
|
291
|
+
},
|
292
|
+
{ web_status: 'formatted',
|
215
293
|
url: 'www.www.toyotatown.net/staff/management',
|
216
294
|
url_f: 'http://www.toyotatown.net',
|
217
295
|
url_path: '/staff/management',
|
@@ -220,12 +298,11 @@ scrubbed_web_hashes = [
|
|
220
298
|
neg_exts: [],
|
221
299
|
pos_exts: ['net'],
|
222
300
|
neg_paths: [],
|
223
|
-
pos_paths: ['
|
301
|
+
pos_paths: ['management, staff'],
|
224
302
|
neg_urls: [],
|
225
303
|
pos_urls: ['toyota']
|
226
304
|
},
|
227
|
-
{
|
228
|
-
web_status: 'formatted',
|
305
|
+
{ web_status: 'formatted',
|
229
306
|
url: 'www.www.yellowpages.com/business',
|
230
307
|
url_f: 'http://www.yellowpages.com',
|
231
308
|
url_path: '/business',
|
@@ -242,6 +319,313 @@ scrubbed_web_hashes = [
|
|
242
319
|
```
|
243
320
|
|
244
321
|
|
322
|
+
### 2. Scrub Array of Strings:
|
323
|
+
|
324
|
+
You can scrub an array of strings with or without formatting.
|
325
|
+
For scrubbing proper strings (account and business names, job titles, article titles, brands, locations, etc.) like below, you might prefer the proper scrub method, but these examples will use the same criteria and same array of strings to illustrate the difference.
|
326
|
+
|
327
|
+
Continuing with the auto dealership example above, the following examples are to scrub the auto dealership account names. We want to prioritize our data based on those who match our positive criteria, those who match our negative criteria, and those who are neutral.
|
328
|
+
|
329
|
+
### A. Pass in Scrub Criteria
|
330
|
+
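The first step is to load your Strings criteria in hash format. It's not required to enter every key, but each key you use must be a symbol and each value must be an array of strings. Note that `ScrubDb::Strings` only looks up the `pos_criteria` and `neg_criteria` keys (as shown in the Usage Overview above); URL-specific keys such as `neg_urls` or `pos_paths` are read by `ScrubDb::Webs`, so supply your string criteria under `pos_criteria` and `neg_criteria`.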
|
331
|
+
|
332
|
+
```
|
333
|
+
strings_criteria = {
|
334
|
+
neg_urls: %w[aprov avis budget collis eat],
|
335
|
+
pos_urls: %w[acura audi bmw bentley],
|
336
|
+
neg_paths: %w[buy bye call cash cheap click collis cont distrib],
|
337
|
+
pos_paths: %w[team staff management],
|
338
|
+
neg_exts: %w[au ca edu es gov in ru uk us],
|
339
|
+
pos_exts: %w[com net]
|
340
|
+
}
|
341
|
+
|
342
|
+
strings_obj = ScrubDb::Strings.new(strings_criteria)
|
343
|
+
```
|
344
|
+
|
345
|
+
### B. Pass in Strings List
|
346
|
+
|
347
|
+
Next, pass your list of strings to `scrub_strings(strings)` with the syntax below.
|
348
|
+
|
349
|
+
```
|
350
|
+
array_of_strings = [
|
351
|
+
'quick auto approval, inc',
|
352
|
+
'the gmc and bmw-world of AUSTIN tx',
|
353
|
+
'DOWNTOWN CAR REPAIR, INC',
|
354
|
+
'TEXAS TRAVEL, CO',
|
355
|
+
'123 Car-world Kia OF CHICAGO IL',
|
356
|
+
'Main Street Ford in DALLAS tX',
|
357
|
+
'broad st fiat of houston',
|
358
|
+
'hot-deal auto insurance',
|
359
|
+
'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
|
360
|
+
'Young Gmc Trucks',
|
361
|
+
'youmans Chevrolet',
|
362
|
+
'yazell chevy',
|
363
|
+
'quick cAr LUBE',
|
364
|
+
'yAtEs AuTo maLL',
|
365
|
+
'YADKIN VALLEY COLLISION CO',
|
366
|
+
'XIT FORD INC'
|
367
|
+
]
|
368
|
+
|
369
|
+
scrubbed_strings = strings_obj.scrub_strings(array_of_strings)
|
370
|
+
```
|
371
|
+
|
372
|
+
### C. Returned Results
|
373
|
+
|
374
|
+
```
|
375
|
+
scrubbed_strings = [
|
376
|
+
{
|
377
|
+
string: 'quick auto approval, inc',
|
378
|
+
pos_criteria: [],
|
379
|
+
neg_criteria: ['approv, quick']
|
380
|
+
},
|
381
|
+
{
|
382
|
+
string: 'the gmc and bmw-world of AUSTIN tx',
|
383
|
+
pos_criteria: ['bmw, gmc'],
|
384
|
+
neg_criteria: []
|
385
|
+
},
|
386
|
+
{
|
387
|
+
string: 'DOWNTOWN CAR REPAIR, INC',
|
388
|
+
pos_criteria: [],
|
389
|
+
neg_criteria: ['repair']
|
390
|
+
},
|
391
|
+
{
|
392
|
+
string: 'TEXAS TRAVEL, CO',
|
393
|
+
pos_criteria: [],
|
394
|
+
neg_criteria: ['travel']
|
395
|
+
},
|
396
|
+
{
|
397
|
+
string: '123 Car-world Kia OF CHICAGO IL',
|
398
|
+
pos_criteria: ['kia'],
|
399
|
+
neg_criteria: []
|
400
|
+
},
|
401
|
+
{
|
402
|
+
string: 'Main Street Ford in DALLAS tX',
|
403
|
+
pos_criteria: ['ford'],
|
404
|
+
neg_criteria: []
|
405
|
+
},
|
406
|
+
{
|
407
|
+
string: 'broad st fiat of houston',
|
408
|
+
pos_criteria: ['fiat'],
|
409
|
+
neg_criteria: []
|
410
|
+
},
|
411
|
+
{
|
412
|
+
string: 'hot-deal auto insurance',
|
413
|
+
pos_criteria: [],
|
414
|
+
neg_criteria: ['insur']
|
415
|
+
},
|
416
|
+
{
|
417
|
+
string: 'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
|
418
|
+
pos_criteria: [],
|
419
|
+
neg_criteria: ['budget']
|
420
|
+
},
|
421
|
+
{
|
422
|
+
string: 'Young Gmc Trucks',
|
423
|
+
pos_criteria: ['gmc'],
|
424
|
+
neg_criteria: []
|
425
|
+
},
|
426
|
+
{
|
427
|
+
string: 'youmans Chevrolet',
|
428
|
+
pos_criteria: ['chevrolet'],
|
429
|
+
neg_criteria: []
|
430
|
+
},
|
431
|
+
{
|
432
|
+
string: 'yazell chevy',
|
433
|
+
pos_criteria: [],
|
434
|
+
neg_criteria: []
|
435
|
+
},
|
436
|
+
{
|
437
|
+
string: 'quick cAr LUBE',
|
438
|
+
pos_criteria: [],
|
439
|
+
neg_criteria: ['lube, quick']
|
440
|
+
},
|
441
|
+
{
|
442
|
+
string: 'yAtEs AuTo maLL',
|
443
|
+
pos_criteria: [],
|
444
|
+
neg_criteria: []
|
445
|
+
},
|
446
|
+
{
|
447
|
+
string: 'YADKIN VALLEY COLLISION CO',
|
448
|
+
pos_criteria: [],
|
449
|
+
neg_criteria: ['collis']
|
450
|
+
},
|
451
|
+
{
|
452
|
+
string: 'XIT FORD INC',
|
453
|
+
pos_criteria: ['ford'],
|
454
|
+
neg_criteria: []
|
455
|
+
}
|
456
|
+
]
|
457
|
+
```
|
458
|
+
|
459
|
+
|
460
|
+
### 3. Scrub Array of Proper Strings:
|
461
|
+
This method is designed for scrubbing proper strings, like account and business names, job titles, article titles, brands, locations, etc.
|
462
|
+
|
463
|
+
This method is identical to example 2 above (Scrub Array of Strings), except that it first formats the strings using the `Utf8Sanitizer` and `CrmFormatter` gems and then scrubs the formatted results against the same criteria. So, this is a 2-in-1 method, Format + Scrub! Again, this method treats your strings as if they are proper nouns, so compare the results of these two methods to determine which is most suitable for your data.
|
464
|
+
|
465
|
+
### A. Pass in Scrub Criteria
|
466
|
+
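The first step is to load your Strings criteria in hash format. It's not required to enter every key, but each key you use must be a symbol and each value must be an array of strings. Note that `ScrubDb::Strings` only looks up the `pos_criteria` and `neg_criteria` keys (as shown in the Usage Overview above); URL-specific keys such as `neg_urls` or `pos_paths` are read by `ScrubDb::Webs`, so supply your string criteria under `pos_criteria` and `neg_criteria`.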
|
467
|
+
|
468
|
+
```
|
469
|
+
strings_criteria = {
|
470
|
+
neg_urls: %w[aprov avis budget collis eat],
|
471
|
+
pos_urls: %w[acura audi bmw bentley],
|
472
|
+
neg_paths: %w[buy bye call cash cheap click collis cont distrib],
|
473
|
+
pos_paths: %w[team staff management],
|
474
|
+
neg_exts: %w[au ca edu es gov in ru uk us],
|
475
|
+
pos_exts: %w[com net]
|
476
|
+
}
|
477
|
+
|
478
|
+
strings_obj = ScrubDb::Strings.new(strings_criteria)
|
479
|
+
```
|
480
|
+
|
481
|
+
### B. Pass in Strings List
|
482
|
+
|
483
|
+
Next, pass your list of strings to `scrub_proper_strings(strings)` with the syntax below.
|
484
|
+
|
485
|
+
```
|
486
|
+
array_of_strings = [
|
487
|
+
'quick auto approval, inc',
|
488
|
+
'the gmc and bmw-world of AUSTIN tx',
|
489
|
+
'DOWNTOWN CAR REPAIR, INC',
|
490
|
+
'TEXAS TRAVEL, CO',
|
491
|
+
'123 Car-world Kia OF CHICAGO IL',
|
492
|
+
'Main Street Ford in DALLAS tX',
|
493
|
+
'broad st fiat of houston',
|
494
|
+
'hot-deal auto insurance',
|
495
|
+
'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
|
496
|
+
'Young Gmc Trucks',
|
497
|
+
'youmans Chevrolet',
|
498
|
+
'yazell chevy',
|
499
|
+
'quick cAr LUBE',
|
500
|
+
'yAtEs AuTo maLL',
|
501
|
+
'YADKIN VALLEY COLLISION CO',
|
502
|
+
'XIT FORD INC'
|
503
|
+
]
|
504
|
+
|
505
|
+
scrubbed_proper_strings = strings_obj.scrub_proper_strings(array_of_strings)
|
506
|
+
```
|
507
|
+
|
508
|
+
### C. Returned Results
|
509
|
+
|
510
|
+
```
|
511
|
+
scrubbed_proper_strings = [
|
512
|
+
{
|
513
|
+
proper_status: 'formatted',
|
514
|
+
proper: 'quick auto approval, inc',
|
515
|
+
proper_f: 'Quick Auto Approval, Inc',
|
516
|
+
pos_criteria: [],
|
517
|
+
neg_criteria: ['approv, quick']
|
518
|
+
},
|
519
|
+
{
|
520
|
+
proper_status: 'formatted',
|
521
|
+
proper: 'the gmc and bmw-world of AUSTIN tx',
|
522
|
+
proper_f: 'The GMC and BMW-World of Austin TX',
|
523
|
+
pos_criteria: ['bmw, gmc'],
|
524
|
+
neg_criteria: []
|
525
|
+
},
|
526
|
+
{
|
527
|
+
proper_status: 'formatted',
|
528
|
+
proper: 'DOWNTOWN CAR REPAIR, INC',
|
529
|
+
proper_f: 'Downtown Car Repair, Inc',
|
530
|
+
pos_criteria: [],
|
531
|
+
neg_criteria: ['repair']
|
532
|
+
},
|
533
|
+
{
|
534
|
+
proper_status: 'formatted',
|
535
|
+
proper: 'TEXAS TRAVEL, CO',
|
536
|
+
proper_f: 'Texas Travel, Co',
|
537
|
+
pos_criteria: [],
|
538
|
+
neg_criteria: ['travel']
|
539
|
+
},
|
540
|
+
{
|
541
|
+
proper_status: 'formatted',
|
542
|
+
proper: '123 Car-world Kia OF CHICAGO IL',
|
543
|
+
proper_f: '123 Car-World Kia of Chicago IL',
|
544
|
+
pos_criteria: ['kia'],
|
545
|
+
neg_criteria: []
|
546
|
+
},
|
547
|
+
{
|
548
|
+
proper_status: 'formatted',
|
549
|
+
proper: 'Main Street Ford in DALLAS tX',
|
550
|
+
proper_f: 'Main Street Ford in Dallas TX',
|
551
|
+
pos_criteria: ['ford'],
|
552
|
+
neg_criteria: []
|
553
|
+
},
|
554
|
+
{
|
555
|
+
proper_status: 'formatted',
|
556
|
+
proper: 'broad st fiat of houston',
|
557
|
+
proper_f: 'Broad St Fiat of Houston',
|
558
|
+
pos_criteria: ['fiat'],
|
559
|
+
neg_criteria: []
|
560
|
+
},
|
561
|
+
{
|
562
|
+
proper_status: 'formatted',
|
563
|
+
proper: 'hot-deal auto insurance',
|
564
|
+
proper_f: 'Hot-Deal Auto Insurance',
|
565
|
+
pos_criteria: [],
|
566
|
+
neg_criteria: ['insur']
|
567
|
+
},
|
568
|
+
{
|
569
|
+
proper_status: 'formatted',
|
570
|
+
proper: 'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
|
571
|
+
proper_f: 'Budget - Automotores Zona & Franca, Inc',
|
572
|
+
pos_criteria: [],
|
573
|
+
neg_criteria: ['budget']
|
574
|
+
},
|
575
|
+
{
|
576
|
+
proper_status: 'formatted',
|
577
|
+
proper: 'Young Gmc Trucks',
|
578
|
+
proper_f: 'Young GMC Trucks',
|
579
|
+
pos_criteria: ['gmc'],
|
580
|
+
neg_criteria: []
|
581
|
+
},
|
582
|
+
{
|
583
|
+
proper_status: 'formatted',
|
584
|
+
proper: 'youmans Chevrolet',
|
585
|
+
proper_f: 'Youmans Chevrolet',
|
586
|
+
pos_criteria: ['chevrolet'],
|
587
|
+
neg_criteria: []
|
588
|
+
},
|
589
|
+
{
|
590
|
+
proper_status: 'formatted',
|
591
|
+
proper: 'yazell chevy',
|
592
|
+
proper_f: 'Yazell Chevy',
|
593
|
+
pos_criteria: [],
|
594
|
+
neg_criteria: []
|
595
|
+
},
|
596
|
+
{
|
597
|
+
proper_status: 'formatted',
|
598
|
+
proper: 'quick cAr LUBE',
|
599
|
+
proper_f: 'Quick Car Lube',
|
600
|
+
pos_criteria: [],
|
601
|
+
neg_criteria: ['lube, quick']
|
602
|
+
},
|
603
|
+
{
|
604
|
+
proper_status: 'formatted',
|
605
|
+
proper: 'yAtEs AuTo maLL',
|
606
|
+
proper_f: 'Yates Auto Mall',
|
607
|
+
pos_criteria: [],
|
608
|
+
neg_criteria: []
|
609
|
+
},
|
610
|
+
{
|
611
|
+
proper_status: 'formatted',
|
612
|
+
proper: 'YADKIN VALLEY COLLISION CO',
|
613
|
+
proper_f: 'Yadkin Valley Collision Co',
|
614
|
+
pos_criteria: [],
|
615
|
+
neg_criteria: ['collis']
|
616
|
+
},
|
617
|
+
{
|
618
|
+
proper_status: 'formatted',
|
619
|
+
proper: 'XIT FORD INC',
|
620
|
+
proper_f: 'Xit Ford Inc',
|
621
|
+
pos_criteria: ['ford'],
|
622
|
+
neg_criteria: []
|
623
|
+
}
|
624
|
+
]
|
625
|
+
|
626
|
+
```
|
627
|
+
|
628
|
+
|
245
629
|
## Author
|
246
630
|
|
247
631
|
Adam J Booth - [4rlm](https://github.com/4rlm)
|
data/Rakefile
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
2
|
require "rspec/core/rake_task"
|
3
3
|
require 'scrub_db'
|
4
|
-
require '
|
4
|
+
require 'webs_criteria'
|
5
5
|
|
6
6
|
|
7
7
|
RSpec::Core::RakeTask.new(:spec)
|
@@ -17,17 +17,81 @@ task :console do
|
|
17
17
|
require "active_support/all"
|
18
18
|
ARGV.clear
|
19
19
|
|
20
|
-
|
21
|
-
|
20
|
+
scrubbed_webs = run_scrub_webs
|
21
|
+
# scrubbed_strings = run_scrub_strings
|
22
|
+
# scrubbed_proper_strings = run_scrub_proper_strings
|
23
|
+
# binding.pry
|
22
24
|
|
23
25
|
IRB.start
|
24
26
|
end
|
25
27
|
|
26
|
-
|
28
|
+
|
29
|
+
def run_scrub_strings
|
30
|
+
strings_criteria = {
|
31
|
+
pos_criteria: WebsCriteria.seed_pos_urls,
|
32
|
+
neg_criteria: WebsCriteria.seed_neg_urls
|
33
|
+
}
|
34
|
+
|
35
|
+
array_of_strings = [
|
36
|
+
'quick auto approval, inc',
|
37
|
+
'the gmc and bmw-world of AUSTIN tx',
|
38
|
+
'DOWNTOWN CAR REPAIR, INC',
|
39
|
+
'TEXAS TRAVEL, CO',
|
40
|
+
'123 Car-world Kia OF CHICAGO IL',
|
41
|
+
'Main Street Ford in DALLAS tX',
|
42
|
+
'broad st fiat of houston',
|
43
|
+
'hot-deal auto insurance',
|
44
|
+
'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
|
45
|
+
'Young Gmc Trucks',
|
46
|
+
'youmans Chevrolet',
|
47
|
+
'yazell chevy',
|
48
|
+
'quick cAr LUBE',
|
49
|
+
'yAtEs AuTo maLL',
|
50
|
+
'YADKIN VALLEY COLLISION CO',
|
51
|
+
'XIT FORD INC'
|
52
|
+
]
|
53
|
+
|
54
|
+
strings_obj = ScrubDb::Strings.new(strings_criteria)
|
55
|
+
scrubbed_strings = strings_obj.scrub_strings(array_of_strings)
|
56
|
+
end
|
57
|
+
|
58
|
+
|
59
|
+
def run_scrub_proper_strings
|
60
|
+
strings_criteria = {
|
61
|
+
pos_criteria: WebsCriteria.seed_pos_urls,
|
62
|
+
neg_criteria: WebsCriteria.seed_neg_urls
|
63
|
+
}
|
64
|
+
|
65
|
+
array_of_propers = [
|
66
|
+
'quick auto approval, inc',
|
67
|
+
'the gmc and bmw-world of AUSTIN tx',
|
68
|
+
'DOWNTOWN CAR REPAIR, INC',
|
69
|
+
'TEXAS TRAVEL, CO',
|
70
|
+
'123 Car-world Kia OF CHICAGO IL',
|
71
|
+
'Main Street Ford in DALLAS tX',
|
72
|
+
'broad st fiat of houston',
|
73
|
+
'hot-deal auto insurance',
|
74
|
+
'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
|
75
|
+
'Young Gmc Trucks',
|
76
|
+
'youmans Chevrolet',
|
77
|
+
'yazell chevy',
|
78
|
+
'quick cAr LUBE',
|
79
|
+
'yAtEs AuTo maLL',
|
80
|
+
'YADKIN VALLEY COLLISION CO',
|
81
|
+
'XIT FORD INC'
|
82
|
+
]
|
83
|
+
|
84
|
+
strings_obj = ScrubDb::Strings.new(strings_criteria)
|
85
|
+
scrubbed_proper_strings = strings_obj.scrub_proper_strings(array_of_propers)
|
86
|
+
end
|
87
|
+
|
88
|
+
|
89
|
+
|
90
|
+
def run_scrub_webs
|
27
91
|
urls = %w[
|
92
|
+
austinchevrolet.not.real
|
28
93
|
smith_acura.com/staff
|
29
94
|
abcrepair.ca
|
30
|
-
austinchevrolet.not.real
|
31
95
|
hertzrentals.com/review
|
32
96
|
londonhyundai.uk/fleet
|
33
97
|
http://www.townbuick.net/staff
|
@@ -42,6 +106,6 @@ def scrub_sample_urls
|
|
42
106
|
www.www.yellowpages.com/business
|
43
107
|
]
|
44
108
|
|
45
|
-
|
46
|
-
scrubbed_webs =
|
109
|
+
webs_obj = ScrubDb::Webs.new(WebsCriteria.all_scrub_web_criteria)
|
110
|
+
scrubbed_webs = webs_obj.scrub_urls(urls)
|
47
111
|
end
|
data/junk.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
[
|
2
|
+
{
|
3
|
+
proper_status: 'formatted',
|
4
|
+
proper: 'quick auto approval, inc',
|
5
|
+
proper_f: 'Quick Auto Approval, Inc',
|
6
|
+
pos_criteria: [],
|
7
|
+
neg_criteria: ['approv, quick']
|
8
|
+
},
|
9
|
+
{
|
10
|
+
proper_status: 'formatted',
|
11
|
+
proper: 'the gmc and bmw-world of AUSTIN tx',
|
12
|
+
proper_f: 'The GMC and BMW-World of Austin TX',
|
13
|
+
pos_criteria: ['bmw, gmc'],
|
14
|
+
neg_criteria: []
|
15
|
+
},
|
16
|
+
{
|
17
|
+
proper_status: 'formatted',
|
18
|
+
proper: 'DOWNTOWN CAR REPAIR, INC',
|
19
|
+
proper_f: 'Downtown Car Repair, Inc',
|
20
|
+
pos_criteria: [],
|
21
|
+
neg_criteria: ['repair']
|
22
|
+
},
|
23
|
+
{
|
24
|
+
proper_status: 'formatted',
|
25
|
+
proper: 'TEXAS TRAVEL, CO',
|
26
|
+
proper_f: 'Texas Travel, Co',
|
27
|
+
pos_criteria: [],
|
28
|
+
neg_criteria: ['travel']
|
29
|
+
},
|
30
|
+
{
|
31
|
+
proper_status: 'formatted',
|
32
|
+
proper: '123 Car-world Kia OF CHICAGO IL',
|
33
|
+
proper_f: '123 Car-World Kia of Chicago IL',
|
34
|
+
pos_criteria: ['kia'],
|
35
|
+
neg_criteria: []
|
36
|
+
},
|
37
|
+
{
|
38
|
+
proper_status: 'formatted',
|
39
|
+
proper: 'Main Street Ford in DALLAS tX',
|
40
|
+
proper_f: 'Main Street Ford in Dallas TX',
|
41
|
+
pos_criteria: ['ford'],
|
42
|
+
neg_criteria: []
|
43
|
+
},
|
44
|
+
{
|
45
|
+
proper_status: 'formatted',
|
46
|
+
proper: 'broad st fiat of houston',
|
47
|
+
proper_f: 'Broad St Fiat of Houston',
|
48
|
+
pos_criteria: ['fiat'],
|
49
|
+
neg_criteria: []
|
50
|
+
},
|
51
|
+
{
|
52
|
+
proper_status: 'formatted',
|
53
|
+
proper: 'hot-deal auto insurance',
|
54
|
+
proper_f: 'Hot-Deal Auto Insurance',
|
55
|
+
pos_criteria: [],
|
56
|
+
neg_criteria: ['insur']
|
57
|
+
},
|
58
|
+
{
|
59
|
+
proper_status: 'formatted',
|
60
|
+
proper: 'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
|
61
|
+
proper_f: 'Budget - Automotores Zona & Franca, Inc',
|
62
|
+
pos_criteria: [],
|
63
|
+
neg_criteria: ['budget']
|
64
|
+
},
|
65
|
+
{
|
66
|
+
proper_status: 'formatted',
|
67
|
+
proper: 'Young Gmc Trucks',
|
68
|
+
proper_f: 'Young GMC Trucks',
|
69
|
+
pos_criteria: ['gmc'],
|
70
|
+
neg_criteria: []
|
71
|
+
},
|
72
|
+
{
|
73
|
+
proper_status: 'formatted',
|
74
|
+
proper: 'youmans Chevrolet',
|
75
|
+
proper_f: 'Youmans Chevrolet',
|
76
|
+
pos_criteria: ['chevrolet'],
|
77
|
+
neg_criteria: []
|
78
|
+
},
|
79
|
+
{
|
80
|
+
proper_status: 'formatted',
|
81
|
+
proper: 'yazell chevy',
|
82
|
+
proper_f: 'Yazell Chevy',
|
83
|
+
pos_criteria: [],
|
84
|
+
neg_criteria: []
|
85
|
+
},
|
86
|
+
{
|
87
|
+
proper_status: 'formatted',
|
88
|
+
proper: 'quick cAr LUBE',
|
89
|
+
proper_f: 'Quick Car Lube',
|
90
|
+
pos_criteria: [],
|
91
|
+
neg_criteria: ['lube, quick']
|
92
|
+
},
|
93
|
+
{
|
94
|
+
proper_status: 'formatted',
|
95
|
+
proper: 'yAtEs AuTo maLL',
|
96
|
+
proper_f: 'Yates Auto Mall',
|
97
|
+
pos_criteria: [],
|
98
|
+
neg_criteria: []
|
99
|
+
},
|
100
|
+
{
|
101
|
+
proper_status: 'formatted',
|
102
|
+
proper: 'YADKIN VALLEY COLLISION CO',
|
103
|
+
proper_f: 'Yadkin Valley Collision Co',
|
104
|
+
pos_criteria: [],
|
105
|
+
neg_criteria: ['collis']
|
106
|
+
},
|
107
|
+
{
|
108
|
+
proper_status: 'formatted',
|
109
|
+
proper: 'XIT FORD INC',
|
110
|
+
proper_f: 'Xit Ford Inc',
|
111
|
+
pos_criteria: ['ford'],
|
112
|
+
neg_criteria: []
|
113
|
+
}
|
114
|
+
]
|
data/lib/scrub_db/filter.rb
CHANGED
@@ -5,47 +5,66 @@ module ScrubDb
|
|
5
5
|
|
6
6
|
def initialize(args={})
|
7
7
|
@args = args
|
8
|
-
# @global_hash = grab_global_hash
|
9
8
|
@empty_criteria = args.empty?
|
10
9
|
end
|
11
10
|
|
12
11
|
def scrub_oa(hash, target, oa_name, include_or_equal)
|
13
12
|
return hash unless oa_name.present? && !@empty_criteria && target.present?
|
14
|
-
criteria =
|
13
|
+
criteria = fetch_criteria(oa_name)
|
15
14
|
|
16
15
|
return hash unless criteria.any?
|
17
|
-
|
18
|
-
|
16
|
+
target = prep_target(target)
|
17
|
+
tars = target_to_tars(target)
|
18
|
+
scrub_matches = match_criteria(tars, include_or_equal, criteria)
|
19
|
+
string_match = stringify_matches(scrub_matches)
|
20
|
+
hash = match_to_hash(hash, string_match, oa_name)
|
21
|
+
end
|
22
|
+
|
23
|
+
def match_to_hash(hsh, match, oa_name)
|
24
|
+
return hsh unless match.present?
|
25
|
+
hsh[oa_name.to_sym] << match
|
26
|
+
hsh
|
27
|
+
end
|
19
28
|
|
29
|
+
def stringify_matches(matches=[])
|
30
|
+
string_match = matches&.uniq&.sort&.join(', ') if matches.any?
|
31
|
+
end
|
32
|
+
|
33
|
+
def fetch_criteria(oa_name)
|
34
|
+
criteria = @args.fetch(oa_name.to_sym, [])
|
35
|
+
criteria = criteria&.map(&:downcase)
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
def match_criteria(tars, include_or_equal, criteria)
|
20
40
|
scrub_matches = tars.map do |tar|
|
21
|
-
return hash unless criteria.present?
|
22
41
|
if include_or_equal == 'include'
|
23
|
-
criteria.
|
42
|
+
criteria.map { |crit| crit if tar.include?(crit) }
|
24
43
|
elsif include_or_equal == 'equal'
|
25
|
-
criteria.
|
44
|
+
criteria.map { |crit| crit if tar == crit }
|
26
45
|
end
|
27
46
|
end
|
47
|
+
scrub_matches = scrub_matches.flatten.compact
|
48
|
+
end
|
28
49
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
50
|
+
def prep_target(target)
|
51
|
+
target = target.join if target.is_a?(Array)
|
52
|
+
target = target.downcase
|
53
|
+
target = target.gsub(',', ' ')
|
54
|
+
target = target.gsub('-', ' ')
|
55
|
+
target = target.squeeze(' ')
|
56
|
+
end
|
34
57
|
|
35
|
-
|
36
|
-
|
37
|
-
# return hash unless scrub_match.present?
|
38
|
-
# if oa_name.include?('web_neg')
|
39
|
-
# hash[:web_neg] << "#{oa_name}: #{scrub_match}"
|
40
|
-
# else
|
41
|
-
# hash[:web_pos] << "#{oa_name}: #{scrub_match}"
|
42
|
-
# end
|
58
|
+
def target_to_tars(target)
|
59
|
+
tars = target.is_a?(::String) ? target.split(' ') : target
|
43
60
|
end
|
61
|
+
|
62
|
+
|
44
63
|
######################################
|
45
64
|
|
46
65
|
|
47
66
|
# def grab_global_hash
|
48
|
-
# keys = %i[row_id act_name street city state zip full_addr phone url street_f city_f state_f zip_f full_addr_f phone_f url_f url_path
|
67
|
+
# keys = %i[row_id act_name street city state zip full_addr phone url street_f city_f state_f zip_f full_addr_f phone_f url_f url_path ScrubWeb_neg address_status phone_status ScrubWeb_status utf_status]
|
49
68
|
# @global_hash = Hash[keys.map { |a| [a, nil] }]
|
50
69
|
# end
|
51
70
|
|
@@ -0,0 +1,52 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module ScrubDb
|
4
|
+
class Strings
|
5
|
+
# attr_accessor :headers, :valid_rows, :encoded_rows, :row_id, :data_hash, :defective_rows, :error_rows
|
6
|
+
|
7
|
+
def initialize(criteria={})
|
8
|
+
@empty_criteria = criteria&.empty?
|
9
|
+
@filter = ScrubDb::Filter.new(criteria) unless @empty_criteria
|
10
|
+
end
|
11
|
+
|
12
|
+
def scrub_proper_strings(props=[])
|
13
|
+
prop_hashes = CrmFormatter.format_propers(props)
|
14
|
+
prop_hashes = merge_criteria(prop_hashes)
|
15
|
+
prop_hashes.map! { |prop_hsh| scrub_hash(prop_hsh) }
|
16
|
+
end
|
17
|
+
|
18
|
+
def scrub_strings(strings=[])
|
19
|
+
str_hashes = strings_to_hashes(strings)
|
20
|
+
str_hashes = merge_criteria(str_hashes)
|
21
|
+
str_hashes.map! { |str_hsh| scrub_hash(str_hsh) }
|
22
|
+
end
|
23
|
+
|
24
|
+
def strings_to_hashes(strings)
|
25
|
+
str_hashes = strings.map { |str| { string: str } }
|
26
|
+
end
|
27
|
+
|
28
|
+
def merge_criteria(hashes)
|
29
|
+
hashes.map do |hsh|
|
30
|
+
hsh.merge({ pos_criteria: [], neg_criteria: [] })
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def scrub_hash(hsh)
|
35
|
+
str = hsh[:string]
|
36
|
+
prop = hsh[:proper_f]
|
37
|
+
|
38
|
+
if str.present?
|
39
|
+
hsh = @filter.scrub_oa(hsh, str, 'neg_criteria', 'include')
|
40
|
+
hsh = @filter.scrub_oa(hsh, str, 'pos_criteria', 'include')
|
41
|
+
end
|
42
|
+
|
43
|
+
if prop.present?
|
44
|
+
hsh = @filter.scrub_oa(hsh, prop, 'neg_criteria', 'include')
|
45
|
+
hsh = @filter.scrub_oa(hsh, prop, 'pos_criteria', 'include')
|
46
|
+
end
|
47
|
+
hsh
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
data/lib/scrub_db/version.rb
CHANGED
@@ -0,0 +1,70 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module ScrubDb
  # Scrubs URLs against caller-supplied criteria. URLs are formatted with
  # CrmFormatter.format_urls, their host extensions are extracted, and the
  # extensions, URL and path are each passed to ScrubDb::Filter#scrub_oa.
  class Webs
    # @param criteria [Hash, nil] scrub criteria handed to ScrubDb::Filter.
    #   nil or an empty hash means "no criteria": no filter is built and
    #   URL hashes are returned with only :url_exts filled in.
    def initialize(criteria = {})
      # nil is treated like an empty hash so Filter never receives nil.
      @empty_criteria = criteria.nil? || criteria.empty?
      @filter = ScrubDb::Filter.new(criteria) unless @empty_criteria
    end

    # Formats, prepares and scrubs a list of URLs.
    #
    # @param urls [Array<String>] raw URLs
    # @return [Array<Hash>] one hash per URL as produced by pre_scrub
    def scrub_urls(urls = [])
      formatted_url_hashes = CrmFormatter.format_urls(urls)
      pre_scrub(merge_criteria_hashes(formatted_url_hashes))
    end

    # Extracts extensions for, and scrubs, every hash that has a non-blank
    # :url_f; hashes without one pass through unchanged.
    #
    # @param hashes [Array<Hash>] hashes prepared by merge_criteria_hashes
    # @return [Array<Hash>]
    def pre_scrub(hashes)
      hashes.map do |hsh|
        next hsh unless hsh[:url_f].present?

        hsh[:url_exts] = extract_exts(hsh)
        scrub_url_hash(hsh)
      end
    end

    # Adds the empty result arrays to every hash, in place.
    #
    # @param hashes [Array<Hash>]
    # @return [Array<Hash>] the same array, mutated
    def merge_criteria_hashes(hashes)
      hashes.map! { |url_hash| merge_criteria_hash(url_hash) }
    end

    # Resets every result key on +url_hash+ to a fresh empty array,
    # mutating and returning the hash.
    #
    # @param url_hash [Hash]
    # @return [Hash] the same hash
    def merge_criteria_hash(url_hash)
      url_hash.merge!(
        url_exts: [],
        neg_exts: [],
        pos_exts: [],
        neg_paths: [],
        pos_paths: [],
        neg_urls: [],
        pos_urls: []
      )
    end

    # Returns the host labels from the third one onward, e.g.
    # "https://www.example.business.site" => ["business", "site"].
    # NOTE(review): this skips exactly two labels, so it looks like :url_f
    # is expected to carry a leading "www." - confirm against CrmFormatter.
    #
    # @param url_hash [Hash] hash whose :url_f is the formatted URL
    # @return [Array<String>] extensions; empty when the URL has no host,
    #   has fewer than three host labels, or cannot be parsed
    def extract_exts(url_hash)
      host = URI(url_hash[:url_f]).host
      # Slicing past the end of an array yields nil, so fall back to [] to
      # keep :url_exts an array.
      host.to_s.split('.')[2..-1] || []
    rescue URI::InvalidURIError
      # An unparsable URL simply has no extensions to compare.
      []
    end

    # Runs every criteria list against the matching part of the URL hash:
    # extensions by equality, URL and path by inclusion; negative list
    # first, then positive.
    #
    # @param url_hash [Hash] hash with :url_f, :url_path and :url_exts
    # @return [Hash] the hash as returned by the filter, or +url_hash+
    #   untouched when no criteria were configured
    def scrub_url_hash(url_hash)
      # No criteria means no filter was built; there is nothing to match.
      return url_hash unless @filter

      # Targets are read up front, as scrub_oa may return a different hash.
      url = url_hash[:url_f]
      path = url_hash[:url_path]
      url_exts = url_hash[:url_exts]

      url_hash = @filter.scrub_oa(url_hash, url_exts, 'neg_exts', 'equal')
      url_hash = @filter.scrub_oa(url_hash, url_exts, 'pos_exts', 'equal')
      url_hash = @filter.scrub_oa(url_hash, url, 'neg_urls', 'include')
      url_hash = @filter.scrub_oa(url_hash, url, 'pos_urls', 'include')
      url_hash = @filter.scrub_oa(url_hash, path, 'neg_paths', 'include')
      @filter.scrub_oa(url_hash, path, 'pos_paths', 'include')
    end
  end
end
|
data/lib/scrub_db.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
#
|
1
|
+
# WebsCriteria.new.all_scrub_web_criteria
|
2
2
|
|
3
|
-
class
|
3
|
+
class WebsCriteria
|
4
4
|
|
5
|
-
def self.
|
5
|
+
def self.all_scrub_web_criteria
|
6
6
|
{
|
7
7
|
neg_urls: seed_neg_urls,
|
8
8
|
pos_urls: seed_pos_urls,
|
@@ -46,10 +46,10 @@ class WebCriteria
|
|
46
46
|
# end
|
47
47
|
|
48
48
|
|
49
|
-
# ##Rails C: StartCrm.
|
49
|
+
# ##Rails C: StartCrm.run_scrub_webs
|
50
50
|
# def self.get_urls
|
51
51
|
# urls = %w(approvedautosales.org autosmartfinance.com leessummitautorepair.net melodytoyota.com northeastacura.com gemmazda.com)
|
52
|
-
# urls += %w(
|
52
|
+
# urls += %w(Scrubwebsite.com Scrubwebsite.business.site Scrubwebsite Scrubwebsite.fake Scrubwebsite.fake.com Scrubwebsite.com.fake)
|
53
53
|
# end
|
54
54
|
|
55
55
|
end
|
data/scrub_db.gemspec
CHANGED
@@ -12,8 +12,8 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.homepage = 'https://github.com/4rlm/scrub_db'
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
15
|
-
spec.summary = %q{Scrub data
|
16
|
-
spec.description = %q{Scrub data
|
15
|
+
spec.summary = %q{Scrub your database, api data, web scraping data, and web form submissions based on your your custom criteria. Allows for different criteria for different jobs. Returns detailed reporting to zero-in on your data with ease, efficiency, and greater insight.}
|
16
|
+
spec.description = %q{Scrub your database, api data, web scraping data, and web form submissions based on your your custom criteria. Allows for different criteria for different jobs. Returns detailed reporting to zero-in on your data with ease, efficiency, and greater insight. Allows for option to pre-format data before scrubbing to also normalize and standardize your data sets, ex uniform URL patterns}
|
17
17
|
|
18
18
|
if spec.respond_to?(:metadata)
|
19
19
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
@@ -42,7 +42,7 @@ Gem::Specification.new do |spec|
|
|
42
42
|
# spec.add_dependency "activesupport-inflector", ['~> 0.1.0']
|
43
43
|
|
44
44
|
spec.add_dependency "utf8_sanitizer", "~> 2.0"
|
45
|
-
spec.add_dependency "crm_formatter", "~> 2.
|
45
|
+
spec.add_dependency "crm_formatter", "~> 2.6"
|
46
46
|
|
47
47
|
spec.add_development_dependency 'bundler', '~> 1.16', '>= 1.16.2'
|
48
48
|
spec.add_development_dependency 'byebug', '~> 10.0', '>= 10.0.2'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrub_db
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: '2.0'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Booth
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-06-
|
11
|
+
date: 2018-06-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -50,14 +50,14 @@ dependencies:
|
|
50
50
|
requirements:
|
51
51
|
- - "~>"
|
52
52
|
- !ruby/object:Gem::Version
|
53
|
-
version: '2.
|
53
|
+
version: '2.6'
|
54
54
|
type: :runtime
|
55
55
|
prerelease: false
|
56
56
|
version_requirements: !ruby/object:Gem::Requirement
|
57
57
|
requirements:
|
58
58
|
- - "~>"
|
59
59
|
- !ruby/object:Gem::Version
|
60
|
-
version: '2.
|
60
|
+
version: '2.6'
|
61
61
|
- !ruby/object:Gem::Dependency
|
62
62
|
name: bundler
|
63
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -208,8 +208,11 @@ dependencies:
|
|
208
208
|
- - "~>"
|
209
209
|
- !ruby/object:Gem::Version
|
210
210
|
version: 0.97.4
|
211
|
-
description: Scrub data
|
212
|
-
|
211
|
+
description: Scrub your database, api data, web scraping data, and web form submissions
|
212
|
+
based on your custom criteria. Allows for different criteria for different
|
213
|
+
jobs. Returns detailed reporting to zero-in on your data with ease, efficiency,
|
214
|
+
and greater insight. Allows for option to pre-format data before scrubbing to also
|
215
|
+
normalize and standardize your data sets, ex uniform URL patterns
|
213
216
|
email:
|
214
217
|
- 4rlm@protonmail.ch
|
215
218
|
executables: []
|
@@ -218,6 +221,7 @@ extra_rdoc_files: []
|
|
218
221
|
files:
|
219
222
|
- ".gitignore"
|
220
223
|
- ".rspec"
|
224
|
+
- ".rspec_status"
|
221
225
|
- ".travis.yml"
|
222
226
|
- CODE_OF_CONDUCT.md
|
223
227
|
- Gemfile
|
@@ -226,11 +230,13 @@ files:
|
|
226
230
|
- Rakefile
|
227
231
|
- bin/console
|
228
232
|
- bin/setup
|
233
|
+
- junk.rb
|
229
234
|
- lib/scrub_db.rb
|
230
235
|
- lib/scrub_db/filter.rb
|
236
|
+
- lib/scrub_db/strings.rb
|
231
237
|
- lib/scrub_db/version.rb
|
232
|
-
- lib/scrub_db/
|
233
|
-
- lib/
|
238
|
+
- lib/scrub_db/webs.rb
|
239
|
+
- lib/webs_criteria.rb
|
234
240
|
- scrub_db.gemspec
|
235
241
|
homepage: https://github.com/4rlm/scrub_db
|
236
242
|
licenses:
|
@@ -248,13 +254,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
248
254
|
version: 2.5.1
|
249
255
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
250
256
|
requirements:
|
251
|
-
- - "
|
257
|
+
- - ">="
|
252
258
|
- !ruby/object:Gem::Version
|
253
|
-
version:
|
259
|
+
version: '0'
|
254
260
|
requirements: []
|
255
261
|
rubyforge_project:
|
256
262
|
rubygems_version: 2.7.6
|
257
263
|
signing_key:
|
258
264
|
specification_version: 4
|
259
|
-
summary: Scrub data
|
265
|
+
summary: Scrub your database, api data, web scraping data, and web form submissions
|
266
|
+
based on your custom criteria. Allows for different criteria for different
|
267
|
+
jobs. Returns detailed reporting to zero-in on your data with ease, efficiency,
|
268
|
+
and greater insight.
|
260
269
|
test_files: []
|
data/lib/scrub_db/web.rb
DELETED
@@ -1,108 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
module ScrubDb
|
4
|
-
class Web
|
5
|
-
# attr_accessor :headers, :valid_rows, :encoded_rows, :row_id, :data_hash, :defective_rows, :error_rows
|
6
|
-
|
7
|
-
def initialize(criteria={})
|
8
|
-
@empty_criteria = criteria&.empty?
|
9
|
-
@filter = ScrubDb::Filter.new(criteria) unless @empty_criteria
|
10
|
-
end
|
11
|
-
|
12
|
-
def scrub_urls(urls=[])
|
13
|
-
formatted_url_hashes = CrmFormatter.format_urls(urls)
|
14
|
-
formatted_url_hashes = merge_criteria_hashes(formatted_url_hashes)
|
15
|
-
|
16
|
-
formatted_url_hashes.map! do |url_hash|
|
17
|
-
if url_hash[:web_status] != 'invalid' && url_hash[:url_f].present?
|
18
|
-
url_hash[:url_exts] = extract_exts(url_hash)
|
19
|
-
url_hash = scrub_url_hash(url_hash)
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
def merge_criteria_hashes(hashes)
|
25
|
-
hashes.map! do |url_hash|
|
26
|
-
merge_criteria_hash(url_hash)
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
def merge_criteria_hash(url_hash)
|
31
|
-
url_hash.merge!(
|
32
|
-
{
|
33
|
-
url_exts: [],
|
34
|
-
neg_exts: [],
|
35
|
-
pos_exts: [],
|
36
|
-
neg_paths: [],
|
37
|
-
pos_paths: [],
|
38
|
-
neg_urls: [],
|
39
|
-
pos_urls: []
|
40
|
-
}
|
41
|
-
)
|
42
|
-
end
|
43
|
-
|
44
|
-
def extract_exts(url_hash)
|
45
|
-
uri_parts = URI(url_hash[:url_f]).host&.split('.')
|
46
|
-
url_exts = uri_parts[2..-1]
|
47
|
-
end
|
48
|
-
|
49
|
-
def scrub_url_hash(url_hash)
|
50
|
-
url = url_hash[:url_f]
|
51
|
-
path = url_hash[:url_path]
|
52
|
-
href = url_hash[:href]
|
53
|
-
url_exts = url_hash[:url_exts]
|
54
|
-
|
55
|
-
url_hash = @filter.scrub_oa(url_hash, url_exts, 'neg_exts', 'equal')
|
56
|
-
url_hash = @filter.scrub_oa(url_hash, url_exts, 'pos_exts', 'equal')
|
57
|
-
url_hash = @filter.scrub_oa(url_hash, url, 'neg_urls', 'include')
|
58
|
-
url_hash = @filter.scrub_oa(url_hash, url, 'pos_urls', 'include')
|
59
|
-
url_hash = @filter.scrub_oa(url_hash, path, 'neg_paths', 'include')
|
60
|
-
url_hash = @filter.scrub_oa(url_hash, path, 'pos_paths', 'include')
|
61
|
-
url_hash
|
62
|
-
end
|
63
|
-
|
64
|
-
# def remove_invalid_links(link)
|
65
|
-
# link_hsh = { link: link, valid_link: nil, flags: nil }
|
66
|
-
# return link_hsh unless link.present?
|
67
|
-
# @neg_paths += get_symbs
|
68
|
-
# flags = @neg_paths.select { |red| link&.include?(red) }
|
69
|
-
# flags << "below #{2}" if link.length < 2
|
70
|
-
# flags << "over #{100}" if link.length > 100
|
71
|
-
# flags = flags.flatten.compact
|
72
|
-
# valid_link = flags.any? ? nil : link
|
73
|
-
# link_hsh[:valid_link] = valid_link
|
74
|
-
# link_hsh[:flags] = flags.join(', ')
|
75
|
-
# binding.pry
|
76
|
-
# link_hsh
|
77
|
-
# end
|
78
|
-
|
79
|
-
# def remove_invalid_hrefs(href)
|
80
|
-
# href_hsh = { href: href, valid_href: nil, flags: nil }
|
81
|
-
# return href_hsh unless href.present?
|
82
|
-
# @neg_hrefs += get_symbs
|
83
|
-
# href = href.split('|').join(' ')
|
84
|
-
# href = href.split('/').join(' ')
|
85
|
-
# href&.gsub!('(', ' ')
|
86
|
-
# href&.gsub!(')', ' ')
|
87
|
-
# href&.gsub!('[', ' ')
|
88
|
-
# href&.gsub!(']', ' ')
|
89
|
-
# href&.gsub!(',', ' ')
|
90
|
-
# href&.gsub!("'", ' ')
|
91
|
-
#
|
92
|
-
# flags = []
|
93
|
-
# flags << "over #{100}" if href.length > 100
|
94
|
-
# invalid_text = Regexp.new(/[0-9]/)
|
95
|
-
# flags << invalid_text&.match(href)
|
96
|
-
# href = href&.downcase
|
97
|
-
# href = href&.strip
|
98
|
-
#
|
99
|
-
# flags << @neg_hrefs.select { |red| href&.include?(red) }
|
100
|
-
# flags = flags.flatten.compact.uniq
|
101
|
-
# href_hsh[:valid_href] = href unless flags.any?
|
102
|
-
# href_hsh[:flags] = flags.join(', ')
|
103
|
-
# href_hsh
|
104
|
-
# end
|
105
|
-
|
106
|
-
end
|
107
|
-
|
108
|
-
end
|