scrub_db 0.0.1.pre.rc.03 → 2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 38b27fd85ba16c4f14ca0542699874b3a101f17eb1faf10fadf163cee9e43d20
4
- data.tar.gz: 321ca87b878ac6e66c7788da976e28dc49fa14d83e77d41f19aafb6c26c4ad7f
3
+ metadata.gz: e6140658a28c9843df6f94a6c5948df622edffeaa905bc37232b229e4ac30b62
4
+ data.tar.gz: 56858739677c75f755583d51e69db49634822671a4e0c2eb346ea537682674c6
5
5
  SHA512:
6
- metadata.gz: 6f6c11ca3c7b1575c23d811cdf9e977b1dd2a0a198533260c42e5fd18f3146c1d8b4bdcb73615c094f34a2463c670ec26ee5a4a74ac3fa29ce9f4e0c6949f156
7
- data.tar.gz: 9ed5a5cbecd487b94f6a99b65e666baba8dc19938d0b111fdd573790aa530d627b79403759996d4ac2188690b6bab9b37e17834f5e948f33638767c05d7c7704
6
+ metadata.gz: 4a606e4be1bc35a3530ede12fa66af81970628c3fe7504f50045bac27a4933b1248e06e9f18362fe573d2b8b28cfd10a2e3b797e6a1795ed1f92711506ca4057
7
+ data.tar.gz: e6b10af82247ebebd24357a9c04393a1e16fbd9708cfba703cb5c764138a336fa8f63824b5d2e3f8c3337eeb9c7083739593006de5a2c7510c4ef41ed6b8902b
data/.rspec_status ADDED
@@ -0,0 +1,4 @@
1
+ example_id | status | run_time |
2
+ ---------------------------- | ------ | --------------- |
3
+ ./spec/scrub_db_spec.rb[1:1] | passed | 0.00148 seconds |
4
+ ./spec/scrub_db_spec.rb[1:2] | failed | 0.01648 seconds |
data/README.md CHANGED
@@ -1,5 +1,5 @@
1
1
  # ScrubDb
2
- #### Scrub data with your custom criteria. Returns detailed reporting.
2
+ #### Scrub your database, API data, web scraping data, and web form submissions based on your custom criteria. Allows for different criteria for different jobs. Returns detailed reporting to zero in on your data with ease, efficiency, and greater insight. Also lets you pre-format data before scrubbing to normalize and standardize your data sets, e.g., uniform URL patterns.
3
3
 
4
4
  ## Installation
5
5
 
@@ -19,17 +19,65 @@ Or install it yourself as:
19
19
 
20
20
  ## Usage
21
21
 
22
- More methods coming soon. Currently, Scrub Array of URLs is fully functional.
22
+ ### I. Usage Overview
23
+
24
+
25
+ #### Step 1: Load Your Scrub Criteria:
26
+
27
+ ##### 1. For String Criteria
28
+ ```
29
+ strings_criteria = {
30
+ pos_criteria: %w[your positive criteria here],
31
+ neg_criteria: %w[your negative criteria here]
32
+ }
33
+ strings_obj = ScrubDb::Strings.new(strings_criteria)
34
+ ```
35
+
36
+ ##### 2. For Web Criteria
37
+ ```
38
+ webs_criteria = {
39
+ pos_criteria: %w[your positive criteria here],
40
+ neg_criteria: %w[your negative criteria here]
41
+ }
42
+ webs_obj = ScrubDb::Webs.new(webs_criteria)
43
+ ```
44
+
45
+
46
+
47
+ #### Step 2: Load Your Data to Scrub:
48
+
49
+ ##### Methods available to scrub data:
50
+
51
+ ##### 1. Scrub URLs:
52
+ ```
53
+ scrub_web_obj = ScrubDb::Webs.new(criteria)
54
+ scrubbed_web_hashes = scrub_web_obj.scrub_urls(urls)
55
+ ```
56
+
57
+ ##### 2. Scrub Strings:
58
+ ```
59
+ strings_obj = ScrubDb::Strings.new(strings_criteria)
60
+ scrubbed_strings = strings_obj.scrub_strings(array_of_strings)
61
+ ```
62
+
63
+ ##### 3. Scrub Proper Strings:
64
+ ```
65
+ strings_obj = ScrubDb::Strings.new(strings_criteria)
66
+ scrubbed_prop_strings = strings_obj.scrub_proper_strings(array_of_props)
67
+ ```
68
+
69
+
70
+ ### II. Usage Details
23
71
 
24
72
  ### 1. Scrub Array of URLs:
25
73
  This is an example of scrubbing auto dealership URLs. We only want URLs based in the US with paths that point to staff pages. Most of our URLs are good, but we want to confirm that they all meet our requirements.
26
74
 
27
75
  ### A. Pass in Scrub Criteria
28
- First step is to load your web criteria in hash format. It's not required to enter all the keys below, but for those you are using, each key must be a symbol and be exactly the same as the ones below. The values must each be an array of strings.
76
+ The first step is to load your Webs criteria in hash format. You don't need to include every key shown below, but each key you do use must be a symbol that exactly matches the ones below, and each value must be an array of strings.
29
77
 
30
78
  ```
31
79
  criteria = {
32
- neg_urls: %w[pprov avis budget collis eat],
80
+ neg_urls: %w[aprov avis budget collis eat],
33
81
  pos_urls: %w[acura audi bmw bentley],
34
82
  neg_paths: %w[buy bye call cash cheap click collis cont distrib],
35
83
  pos_paths: %w[team staff management],
@@ -37,17 +85,18 @@ criteria = {
37
85
  pos_exts: %w[com net]
38
86
  }
39
87
 
40
- web_obj = ScrubDb::Web.new(criteria)
88
+ scrub_web_obj = ScrubDb::Webs.new(criteria)
41
89
  ```
42
90
 
43
91
  ### B. Pass in URLs List
92
+
44
93
  Next, pass your list of URLs to `scrub_urls(urls)` with the syntax below.
45
94
 
46
95
  ```
47
96
  urls = %w[
97
+ austinchevrolet.not.real
48
98
  smith_acura.com/staff
49
99
  abcrepair.ca
50
- austinchevrolet.not.real
51
100
  hertzrentals.com/review
52
101
  londonhyundai.uk/fleet
53
102
  http://www.townbuick.net/staff
@@ -62,7 +111,7 @@ urls = %w[
62
111
  www.www.yellowpages.com/business
63
112
  ]
64
113
 
65
- scrubbed_web_hashes = web_obj.scrub_urls(urls)
114
+ scrubbed_web_hashes = scrub_web_obj.scrub_urls(urls)
66
115
  ```
67
116
 
68
117
  ### C. Returned Results
@@ -71,7 +120,20 @@ Notice that the URLs in the list above are NOT uniformly formatted. ScrubDb lev
71
120
  ```
72
121
  scrubbed_web_hashes = [
73
122
  {
74
- web_status: 'formatted',
123
+ web_status: 'invalid',
124
+ url: 'austinchevrolet.not.real',
125
+ url_f: nil,
126
+ url_path: nil,
127
+ web_neg: 'error: ext.invalid [not, real]',
128
+ url_exts: [],
129
+ neg_exts: [],
130
+ pos_exts: [],
131
+ neg_paths: [],
132
+ pos_paths: [],
133
+ neg_urls: [],
134
+ pos_urls: []
135
+ },
136
+ { web_status: 'formatted',
75
137
  url: 'smith_acura.com/staff',
76
138
  url_f: 'http://www.smith_acura.com',
77
139
  url_path: '/staff',
@@ -84,8 +146,7 @@ scrubbed_web_hashes = [
84
146
  neg_urls: [],
85
147
  pos_urls: ['acura']
86
148
  },
87
- {
88
- web_status: 'formatted',
149
+ { web_status: 'formatted',
89
150
  url: 'abcrepair.ca',
90
151
  url_f: 'http://www.abcrepair.ca',
91
152
  url_path: nil,
@@ -98,8 +159,7 @@ scrubbed_web_hashes = [
98
159
  neg_urls: ['repair'],
99
160
  pos_urls: []
100
161
  },
101
- {
102
- web_status: 'formatted',
162
+ { web_status: 'formatted',
103
163
  url: 'hertzrentals.com/review',
104
164
  url_f: 'http://www.hertzrentals.com',
105
165
  url_path: '/review',
@@ -112,8 +172,7 @@ scrubbed_web_hashes = [
112
172
  neg_urls: ['hertz, rent'],
113
173
  pos_urls: []
114
174
  },
115
- {
116
- web_status: 'formatted',
175
+ { web_status: 'formatted',
117
176
  url: 'londonhyundai.uk/fleet',
118
177
  url_f: 'http://www.londonhyundai.uk',
119
178
  url_path: '/fleet',
@@ -126,8 +185,7 @@ scrubbed_web_hashes = [
126
185
  neg_urls: [],
127
186
  pos_urls: ['hyundai']
128
187
  },
129
- {
130
- web_status: 'formatted',
188
+ { web_status: 'formatted',
131
189
  url: 'http://www.townbuick.net/staff',
132
190
  url_f: 'http://www.townbuick.net',
133
191
  url_path: nil,
@@ -140,8 +198,7 @@ scrubbed_web_hashes = [
140
198
  neg_urls: [],
141
199
  pos_urls: ['buick']
142
200
  },
143
- {
144
- web_status: 'formatted',
201
+ { web_status: 'formatted',
145
202
  url: 'http://youtube.com/download',
146
203
  url_f: 'http://www.youtube.com',
147
204
  url_path: nil,
@@ -154,8 +211,7 @@ scrubbed_web_hashes = [
154
211
  neg_urls: ['youtube'],
155
212
  pos_urls: []
156
213
  },
157
- {
158
- web_status: 'formatted',
214
+ { web_status: 'formatted',
159
215
  url: 'www.madridinfiniti.es/collision',
160
216
  url_f: 'http://www.madridinfiniti.es',
161
217
  url_path: '/collision',
@@ -168,8 +224,20 @@ scrubbed_web_hashes = [
168
224
  neg_urls: [],
169
225
  pos_urls: ['infiniti']
170
226
  },
171
- {
172
- web_status: 'formatted',
227
+ { web_status: 'invalid',
228
+ url: 'www.mitsubishideals.sofake',
229
+ url_f: nil,
230
+ url_path: nil,
231
+ web_neg: 'error: ext.invalid [sofake]',
232
+ url_exts: [],
233
+ neg_exts: [],
234
+ pos_exts: [],
235
+ neg_paths: [],
236
+ pos_paths: [],
237
+ neg_urls: [],
238
+ pos_urls: []
239
+ },
240
+ { web_status: 'formatted',
173
241
  url: 'www.dallassubaru.com.sofake',
174
242
  url_f: 'http://www.dallassubaru.com',
175
243
  url_path: nil,
@@ -182,8 +250,7 @@ scrubbed_web_hashes = [
182
250
  neg_urls: [],
183
251
  pos_urls: ['subaru']
184
252
  },
185
- {
186
- web_status: 'formatted',
253
+ { web_status: 'formatted',
187
254
  url: 'www.quickeats.net/contact_us',
188
255
  url_f: 'http://www.quickeats.net',
189
256
  url_path: '/contact_us',
@@ -196,8 +263,7 @@ scrubbed_web_hashes = [
196
263
  neg_urls: ['eat, quick'],
197
264
  pos_urls: []
198
265
  },
199
- {
200
- web_status: 'formatted',
266
+ { web_status: 'formatted',
201
267
  url: 'www.school.edu/teachers',
202
268
  url_f: 'http://www.school.edu',
203
269
  url_path: '/teachers',
@@ -210,8 +276,20 @@ scrubbed_web_hashes = [
210
276
  neg_urls: [],
211
277
  pos_urls: []
212
278
  },
213
- {
214
- web_status: 'formatted',
279
+ { web_status: 'invalid',
280
+ url: 'www.www.nissancars/inventory',
281
+ url_f: nil,
282
+ url_path: nil,
283
+ web_neg: 'error: ext.none',
284
+ url_exts: [],
285
+ neg_exts: [],
286
+ pos_exts: [],
287
+ neg_paths: [],
288
+ pos_paths: [],
289
+ neg_urls: [],
290
+ pos_urls: []
291
+ },
292
+ { web_status: 'formatted',
215
293
  url: 'www.www.toyotatown.net/staff/management',
216
294
  url_f: 'http://www.toyotatown.net',
217
295
  url_path: '/staff/management',
@@ -220,12 +298,11 @@ scrubbed_web_hashes = [
220
298
  neg_exts: [],
221
299
  pos_exts: ['net'],
222
300
  neg_paths: [],
223
- pos_paths: ['staff, management'],
301
+ pos_paths: ['management, staff'],
224
302
  neg_urls: [],
225
303
  pos_urls: ['toyota']
226
304
  },
227
- {
228
- web_status: 'formatted',
305
+ { web_status: 'formatted',
229
306
  url: 'www.www.yellowpages.com/business',
230
307
  url_f: 'http://www.yellowpages.com',
231
308
  url_path: '/business',
@@ -242,6 +319,313 @@ scrubbed_web_hashes = [
242
319
  ```
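
Since every returned hash carries the same reporting keys, downstream filtering stays simple. A minimal sketch (illustrative only, not part of the gem; it assumes only the keys shown in the sample output above):

```
# Triage the scrubbed hashes by the reported status and criteria keys.
valid, invalid = scrubbed_web_hashes.partition { |hsh| hsh[:web_status] == 'formatted' }

flagged = valid.select do |hsh|
  hsh[:neg_urls].any? || hsh[:neg_paths].any? || hsh[:neg_exts].any?
end

keepers = valid - flagged
```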
243
320
 
244
321
 
322
+ ### 2. Scrub Array of Strings:
323
+
324
+ You can scrub an array of strings with or without formatting.
325
+ For proper strings like those below (account and business names, job titles, article titles, brands, locations, etc.), you might prefer the proper scrub method, but these examples use the same criteria and the same array of strings to illustrate the difference.
326
+
327
+ Continuing with the auto dealership example above, the following examples scrub the auto dealership account names. We want to prioritize our data by which records match our positive criteria, which match our negative criteria, and which are neutral.
328
+
329
+ ### A. Pass in Scrub Criteria
330
+ The first step is to load your Strings criteria in hash format. You don't need to include every key shown below, but each key you do use must be a symbol that exactly matches the ones below, and each value must be an array of strings.
331
+
332
+ ```
333
+ strings_criteria = {
334
+ neg_urls: %w[aprov avis budget collis eat],
335
+ pos_urls: %w[acura audi bmw bentley],
336
+ neg_paths: %w[buy bye call cash cheap click collis cont distrib],
337
+ pos_paths: %w[team staff management],
338
+ neg_exts: %w[au ca edu es gov in ru uk us],
339
+ pos_exts: %w[com net]
340
+ }
341
+
342
+ strings_obj = ScrubDb::Strings.new(strings_criteria)
343
+ ```
344
+
345
+ ### B. Pass in Strings List
346
+
347
+ Next, pass your list of strings to `scrub_strings(strings)` with the syntax below.
348
+
349
+ ```
350
+ array_of_strings = [
351
+ 'quick auto approval, inc',
352
+ 'the gmc and bmw-world of AUSTIN tx',
353
+ 'DOWNTOWN CAR REPAIR, INC',
354
+ 'TEXAS TRAVEL, CO',
355
+ '123 Car-world Kia OF CHICAGO IL',
356
+ 'Main Street Ford in DALLAS tX',
357
+ 'broad st fiat of houston',
358
+ 'hot-deal auto insurance',
359
+ 'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
360
+ 'Young Gmc Trucks',
361
+ 'youmans Chevrolet',
362
+ 'yazell chevy',
363
+ 'quick cAr LUBE',
364
+ 'yAtEs AuTo maLL',
365
+ 'YADKIN VALLEY COLLISION CO',
366
+ 'XIT FORD INC'
367
+ ]
368
+
369
+ scrubbed_strings = strings_obj.scrub_strings(array_of_strings)
370
+ ```
371
+
372
+ ### C. Returned Results
373
+
374
+ ```
375
+ scrubbed_strings = [
376
+ {
377
+ string: 'quick auto approval, inc',
378
+ pos_criteria: [],
379
+ neg_criteria: ['approv, quick']
380
+ },
381
+ {
382
+ string: 'the gmc and bmw-world of AUSTIN tx',
383
+ pos_criteria: ['bmw, gmc'],
384
+ neg_criteria: []
385
+ },
386
+ {
387
+ string: 'DOWNTOWN CAR REPAIR, INC',
388
+ pos_criteria: [],
389
+ neg_criteria: ['repair']
390
+ },
391
+ {
392
+ string: 'TEXAS TRAVEL, CO',
393
+ pos_criteria: [],
394
+ neg_criteria: ['travel']
395
+ },
396
+ {
397
+ string: '123 Car-world Kia OF CHICAGO IL',
398
+ pos_criteria: ['kia'],
399
+ neg_criteria: []
400
+ },
401
+ {
402
+ string: 'Main Street Ford in DALLAS tX',
403
+ pos_criteria: ['ford'],
404
+ neg_criteria: []
405
+ },
406
+ {
407
+ string: 'broad st fiat of houston',
408
+ pos_criteria: ['fiat'],
409
+ neg_criteria: []
410
+ },
411
+ {
412
+ string: 'hot-deal auto insurance',
413
+ pos_criteria: [],
414
+ neg_criteria: ['insur']
415
+ },
416
+ {
417
+ string: 'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
418
+ pos_criteria: [],
419
+ neg_criteria: ['budget']
420
+ },
421
+ {
422
+ string: 'Young Gmc Trucks',
423
+ pos_criteria: ['gmc'],
424
+ neg_criteria: []
425
+ },
426
+ {
427
+ string: 'youmans Chevrolet',
428
+ pos_criteria: ['chevrolet'],
429
+ neg_criteria: []
430
+ },
431
+ {
432
+ string: 'yazell chevy',
433
+ pos_criteria: [],
434
+ neg_criteria: []
435
+ },
436
+ {
437
+ string: 'quick cAr LUBE',
438
+ pos_criteria: [],
439
+ neg_criteria: ['lube, quick']
440
+ },
441
+ {
442
+ string: 'yAtEs AuTo maLL',
443
+ pos_criteria: [],
444
+ neg_criteria: []
445
+ },
446
+ {
447
+ string: 'YADKIN VALLEY COLLISION CO',
448
+ pos_criteria: [],
449
+ neg_criteria: ['collis']
450
+ },
451
+ {
452
+ string: 'XIT FORD INC',
453
+ pos_criteria: ['ford'],
454
+ neg_criteria: []
455
+ }
456
+ ]
457
+ ```
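
To act on this report, you can bucket the records the way the intro above describes: positive matches, negative matches, and neutral. A rough sketch (illustrative only, not gem code; it assumes the `pos_criteria`/`neg_criteria` keys shown above):

```
negative = scrubbed_strings.select { |hsh| hsh[:neg_criteria].any? }
positive = scrubbed_strings.select { |hsh| hsh[:pos_criteria].any? && hsh[:neg_criteria].empty? }
neutral  = scrubbed_strings - positive - negative
```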
458
+
459
+
460
+ ### 3. Scrub Array of Proper Strings:
461
+ This method is designed for scrubbing proper strings, like account and business names, job titles, article titles, brands, locations, etc.
462
+
463
+ This method is identical to example 2 above (Scrub Array of Strings), except that it first formats the strings with the `Utf8Sanitizer` and `CrmFormatter` gems, then passes the results to the method above to scrub. So it's a 2-in-1 method: Format + Scrub. Because it treats your strings as proper nouns, compare the results of the two methods to determine which is most suitable for your data.
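
Because both methods accept the same input, it can help to run them side by side before choosing one. A rough comparison sketch (illustrative only, not gem code; it reuses `strings_obj` and `array_of_strings` from section 2 and assumes both methods return results in input order):

```
plain   = strings_obj.scrub_strings(array_of_strings)
propers = strings_obj.scrub_proper_strings(array_of_strings)

plain.zip(propers).each do |str_hsh, prop_hsh|
  puts "#{prop_hsh[:proper_f]} | plain neg: #{str_hsh[:neg_criteria]} | proper neg: #{prop_hsh[:neg_criteria]}"
end
```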
464
+
465
+ ### A. Pass in Scrub Criteria
466
+ The first step is to load your Strings criteria in hash format. You don't need to include every key shown below, but each key you do use must be a symbol that exactly matches the ones below, and each value must be an array of strings.
467
+
468
+ ```
469
+ strings_criteria = {
470
+ neg_urls: %w[aprov avis budget collis eat],
471
+ pos_urls: %w[acura audi bmw bentley],
472
+ neg_paths: %w[buy bye call cash cheap click collis cont distrib],
473
+ pos_paths: %w[team staff management],
474
+ neg_exts: %w[au ca edu es gov in ru uk us],
475
+ pos_exts: %w[com net]
476
+ }
477
+
478
+ strings_obj = ScrubDb::Strings.new(strings_criteria)
479
+ ```
480
+
481
+ ### B. Pass in Strings List
482
+
483
+ Next, pass your list of strings to `scrub_proper_strings(strings)` with the syntax below.
484
+
485
+ ```
486
+ array_of_strings = [
487
+ 'quick auto approval, inc',
488
+ 'the gmc and bmw-world of AUSTIN tx',
489
+ 'DOWNTOWN CAR REPAIR, INC',
490
+ 'TEXAS TRAVEL, CO',
491
+ '123 Car-world Kia OF CHICAGO IL',
492
+ 'Main Street Ford in DALLAS tX',
493
+ 'broad st fiat of houston',
494
+ 'hot-deal auto insurance',
495
+ 'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
496
+ 'Young Gmc Trucks',
497
+ 'youmans Chevrolet',
498
+ 'yazell chevy',
499
+ 'quick cAr LUBE',
500
+ 'yAtEs AuTo maLL',
501
+ 'YADKIN VALLEY COLLISION CO',
502
+ 'XIT FORD INC'
503
+ ]
504
+
505
+ scrubbed_proper_strings = strings_obj.scrub_proper_strings(array_of_strings)
506
+ ```
507
+
508
+ ### C. Returned Results
509
+
510
+ ```
511
+ scrubbed_proper_strings = [
512
+ {
513
+ proper_status: 'formatted',
514
+ proper: 'quick auto approval, inc',
515
+ proper_f: 'Quick Auto Approval, Inc',
516
+ pos_criteria: [],
517
+ neg_criteria: ['approv, quick']
518
+ },
519
+ {
520
+ proper_status: 'formatted',
521
+ proper: 'the gmc and bmw-world of AUSTIN tx',
522
+ proper_f: 'The GMC and BMW-World of Austin TX',
523
+ pos_criteria: ['bmw, gmc'],
524
+ neg_criteria: []
525
+ },
526
+ {
527
+ proper_status: 'formatted',
528
+ proper: 'DOWNTOWN CAR REPAIR, INC',
529
+ proper_f: 'Downtown Car Repair, Inc',
530
+ pos_criteria: [],
531
+ neg_criteria: ['repair']
532
+ },
533
+ {
534
+ proper_status: 'formatted',
535
+ proper: 'TEXAS TRAVEL, CO',
536
+ proper_f: 'Texas Travel, Co',
537
+ pos_criteria: [],
538
+ neg_criteria: ['travel']
539
+ },
540
+ {
541
+ proper_status: 'formatted',
542
+ proper: '123 Car-world Kia OF CHICAGO IL',
543
+ proper_f: '123 Car-World Kia of Chicago IL',
544
+ pos_criteria: ['kia'],
545
+ neg_criteria: []
546
+ },
547
+ {
548
+ proper_status: 'formatted',
549
+ proper: 'Main Street Ford in DALLAS tX',
550
+ proper_f: 'Main Street Ford in Dallas TX',
551
+ pos_criteria: ['ford'],
552
+ neg_criteria: []
553
+ },
554
+ {
555
+ proper_status: 'formatted',
556
+ proper: 'broad st fiat of houston',
557
+ proper_f: 'Broad St Fiat of Houston',
558
+ pos_criteria: ['fiat'],
559
+ neg_criteria: []
560
+ },
561
+ {
562
+ proper_status: 'formatted',
563
+ proper: 'hot-deal auto insurance',
564
+ proper_f: 'Hot-Deal Auto Insurance',
565
+ pos_criteria: [],
566
+ neg_criteria: ['insur']
567
+ },
568
+ {
569
+ proper_status: 'formatted',
570
+ proper: 'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
571
+ proper_f: 'Budget - Automotores Zona & Franca, Inc',
572
+ pos_criteria: [],
573
+ neg_criteria: ['budget']
574
+ },
575
+ {
576
+ proper_status: 'formatted',
577
+ proper: 'Young Gmc Trucks',
578
+ proper_f: 'Young GMC Trucks',
579
+ pos_criteria: ['gmc'],
580
+ neg_criteria: []
581
+ },
582
+ {
583
+ proper_status: 'formatted',
584
+ proper: 'youmans Chevrolet',
585
+ proper_f: 'Youmans Chevrolet',
586
+ pos_criteria: ['chevrolet'],
587
+ neg_criteria: []
588
+ },
589
+ {
590
+ proper_status: 'formatted',
591
+ proper: 'yazell chevy',
592
+ proper_f: 'Yazell Chevy',
593
+ pos_criteria: [],
594
+ neg_criteria: []
595
+ },
596
+ {
597
+ proper_status: 'formatted',
598
+ proper: 'quick cAr LUBE',
599
+ proper_f: 'Quick Car Lube',
600
+ pos_criteria: [],
601
+ neg_criteria: ['lube, quick']
602
+ },
603
+ {
604
+ proper_status: 'formatted',
605
+ proper: 'yAtEs AuTo maLL',
606
+ proper_f: 'Yates Auto Mall',
607
+ pos_criteria: [],
608
+ neg_criteria: []
609
+ },
610
+ {
611
+ proper_status: 'formatted',
612
+ proper: 'YADKIN VALLEY COLLISION CO',
613
+ proper_f: 'Yadkin Valley Collision Co',
614
+ pos_criteria: [],
615
+ neg_criteria: ['collis']
616
+ },
617
+ {
618
+ proper_status: 'formatted',
619
+ proper: 'XIT FORD INC',
620
+ proper_f: 'Xit Ford Inc',
621
+ pos_criteria: ['ford'],
622
+ neg_criteria: []
623
+ }
624
+ ]
625
+
626
+ ```
627
+
628
+
245
629
  ## Author
246
630
 
247
631
  Adam J Booth - [4rlm](https://github.com/4rlm)
data/Rakefile CHANGED
@@ -1,7 +1,7 @@
1
1
  require "bundler/gem_tasks"
2
2
  require "rspec/core/rake_task"
3
3
  require 'scrub_db'
4
- require 'web_criteria'
4
+ require 'webs_criteria'
5
5
 
6
6
 
7
7
  RSpec::Core::RakeTask.new(:spec)
@@ -17,17 +17,81 @@ task :console do
17
17
  require "active_support/all"
18
18
  ARGV.clear
19
19
 
20
- scrubbed_urls = scrub_sample_urls
21
- binding.pry
20
+ scrubbed_webs = run_scrub_webs
21
+ # scrubbed_strings = run_scrub_strings
22
+ # scrubbed_proper_strings = run_scrub_proper_strings
23
+ # binding.pry
22
24
 
23
25
  IRB.start
24
26
  end
25
27
 
26
- def scrub_sample_urls
28
+
29
+ def run_scrub_strings
30
+ strings_criteria = {
31
+ pos_criteria: WebsCriteria.seed_pos_urls,
32
+ neg_criteria: WebsCriteria.seed_neg_urls
33
+ }
34
+
35
+ array_of_strings = [
36
+ 'quick auto approval, inc',
37
+ 'the gmc and bmw-world of AUSTIN tx',
38
+ 'DOWNTOWN CAR REPAIR, INC',
39
+ 'TEXAS TRAVEL, CO',
40
+ '123 Car-world Kia OF CHICAGO IL',
41
+ 'Main Street Ford in DALLAS tX',
42
+ 'broad st fiat of houston',
43
+ 'hot-deal auto insurance',
44
+ 'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
45
+ 'Young Gmc Trucks',
46
+ 'youmans Chevrolet',
47
+ 'yazell chevy',
48
+ 'quick cAr LUBE',
49
+ 'yAtEs AuTo maLL',
50
+ 'YADKIN VALLEY COLLISION CO',
51
+ 'XIT FORD INC'
52
+ ]
53
+
54
+ strings_obj = ScrubDb::Strings.new(strings_criteria)
55
+ scrubbed_strings = strings_obj.scrub_strings(array_of_strings)
56
+ end
57
+
58
+
59
+ def run_scrub_proper_strings
60
+ strings_criteria = {
61
+ pos_criteria: WebsCriteria.seed_pos_urls,
62
+ neg_criteria: WebsCriteria.seed_neg_urls
63
+ }
64
+
65
+ array_of_propers = [
66
+ 'quick auto approval, inc',
67
+ 'the gmc and bmw-world of AUSTIN tx',
68
+ 'DOWNTOWN CAR REPAIR, INC',
69
+ 'TEXAS TRAVEL, CO',
70
+ '123 Car-world Kia OF CHICAGO IL',
71
+ 'Main Street Ford in DALLAS tX',
72
+ 'broad st fiat of houston',
73
+ 'hot-deal auto insurance',
74
+ 'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
75
+ 'Young Gmc Trucks',
76
+ 'youmans Chevrolet',
77
+ 'yazell chevy',
78
+ 'quick cAr LUBE',
79
+ 'yAtEs AuTo maLL',
80
+ 'YADKIN VALLEY COLLISION CO',
81
+ 'XIT FORD INC'
82
+ ]
83
+
84
+ strings_obj = ScrubDb::Strings.new(strings_criteria)
85
+ scrubbed_proper_strings = strings_obj.scrub_proper_strings(array_of_propers)
86
+ end
87
+
88
+
89
+
90
+ def run_scrub_webs
27
91
  urls = %w[
92
+ austinchevrolet.not.real
28
93
  smith_acura.com/staff
29
94
  abcrepair.ca
30
- austinchevrolet.not.real
31
95
  hertzrentals.com/review
32
96
  londonhyundai.uk/fleet
33
97
  http://www.townbuick.net/staff
@@ -42,6 +106,6 @@ def scrub_sample_urls
42
106
  www.www.yellowpages.com/business
43
107
  ]
44
108
 
45
- web_obj = ScrubDb::Web.new(WebCriteria.all_web_criteria)
46
- scrubbed_webs = web_obj.scrub_urls(urls)
109
+ webs_obj = ScrubDb::Webs.new(WebsCriteria.all_scrub_web_criteria)
110
+ scrubbed_webs = webs_obj.scrub_urls(urls)
47
111
  end
data/junk.rb ADDED
@@ -0,0 +1,114 @@
1
+ [
2
+ {
3
+ proper_status: 'formatted',
4
+ proper: 'quick auto approval, inc',
5
+ proper_f: 'Quick Auto Approval, Inc',
6
+ pos_criteria: [],
7
+ neg_criteria: ['approv, quick']
8
+ },
9
+ {
10
+ proper_status: 'formatted',
11
+ proper: 'the gmc and bmw-world of AUSTIN tx',
12
+ proper_f: 'The GMC and BMW-World of Austin TX',
13
+ pos_criteria: ['bmw, gmc'],
14
+ neg_criteria: []
15
+ },
16
+ {
17
+ proper_status: 'formatted',
18
+ proper: 'DOWNTOWN CAR REPAIR, INC',
19
+ proper_f: 'Downtown Car Repair, Inc',
20
+ pos_criteria: [],
21
+ neg_criteria: ['repair']
22
+ },
23
+ {
24
+ proper_status: 'formatted',
25
+ proper: 'TEXAS TRAVEL, CO',
26
+ proper_f: 'Texas Travel, Co',
27
+ pos_criteria: [],
28
+ neg_criteria: ['travel']
29
+ },
30
+ {
31
+ proper_status: 'formatted',
32
+ proper: '123 Car-world Kia OF CHICAGO IL',
33
+ proper_f: '123 Car-World Kia of Chicago IL',
34
+ pos_criteria: ['kia'],
35
+ neg_criteria: []
36
+ },
37
+ {
38
+ proper_status: 'formatted',
39
+ proper: 'Main Street Ford in DALLAS tX',
40
+ proper_f: 'Main Street Ford in Dallas TX',
41
+ pos_criteria: ['ford'],
42
+ neg_criteria: []
43
+ },
44
+ {
45
+ proper_status: 'formatted',
46
+ proper: 'broad st fiat of houston',
47
+ proper_f: 'Broad St Fiat of Houston',
48
+ pos_criteria: ['fiat'],
49
+ neg_criteria: []
50
+ },
51
+ {
52
+ proper_status: 'formatted',
53
+ proper: 'hot-deal auto insurance',
54
+ proper_f: 'Hot-Deal Auto Insurance',
55
+ pos_criteria: [],
56
+ neg_criteria: ['insur']
57
+ },
58
+ {
59
+ proper_status: 'formatted',
60
+ proper: 'BUDGET - AUTOMOTORES ZONA & FRANCA, INC',
61
+ proper_f: 'Budget - Automotores Zona & Franca, Inc',
62
+ pos_criteria: [],
63
+ neg_criteria: ['budget']
64
+ },
65
+ {
66
+ proper_status: 'formatted',
67
+ proper: 'Young Gmc Trucks',
68
+ proper_f: 'Young GMC Trucks',
69
+ pos_criteria: ['gmc'],
70
+ neg_criteria: []
71
+ },
72
+ {
73
+ proper_status: 'formatted',
74
+ proper: 'youmans Chevrolet',
75
+ proper_f: 'Youmans Chevrolet',
76
+ pos_criteria: ['chevrolet'],
77
+ neg_criteria: []
78
+ },
79
+ {
80
+ proper_status: 'formatted',
81
+ proper: 'yazell chevy',
82
+ proper_f: 'Yazell Chevy',
83
+ pos_criteria: [],
84
+ neg_criteria: []
85
+ },
86
+ {
87
+ proper_status: 'formatted',
88
+ proper: 'quick cAr LUBE',
89
+ proper_f: 'Quick Car Lube',
90
+ pos_criteria: [],
91
+ neg_criteria: ['lube, quick']
92
+ },
93
+ {
94
+ proper_status: 'formatted',
95
+ proper: 'yAtEs AuTo maLL',
96
+ proper_f: 'Yates Auto Mall',
97
+ pos_criteria: [],
98
+ neg_criteria: []
99
+ },
100
+ {
101
+ proper_status: 'formatted',
102
+ proper: 'YADKIN VALLEY COLLISION CO',
103
+ proper_f: 'Yadkin Valley Collision Co',
104
+ pos_criteria: [],
105
+ neg_criteria: ['collis']
106
+ },
107
+ {
108
+ proper_status: 'formatted',
109
+ proper: 'XIT FORD INC',
110
+ proper_f: 'Xit Ford Inc',
111
+ pos_criteria: ['ford'],
112
+ neg_criteria: []
113
+ }
114
+ ]
@@ -5,47 +5,66 @@ module ScrubDb
5
5
 
6
6
  def initialize(args={})
7
7
  @args = args
8
- # @global_hash = grab_global_hash
9
8
  @empty_criteria = args.empty?
10
9
  end
11
10
 
12
11
  def scrub_oa(hash, target, oa_name, include_or_equal)
13
12
  return hash unless oa_name.present? && !@empty_criteria && target.present?
14
- criteria = @args.fetch(oa_name.to_sym, [])
13
+ criteria = fetch_criteria(oa_name)
15
14
 
16
15
  return hash unless criteria.any?
17
- tars = target.is_a?(::String) ? target.split(', ') : target
18
- binding.pry if !tars.present?
16
+ target = prep_target(target)
17
+ tars = target_to_tars(target)
18
+ scrub_matches = match_criteria(tars, include_or_equal, criteria)
19
+ string_match = stringify_matches(scrub_matches)
20
+ hash = match_to_hash(hash, string_match, oa_name)
21
+ end
22
+
23
+ def match_to_hash(hsh, match, oa_name)
24
+ return hsh unless match.present?
25
+ hsh[oa_name.to_sym] << match
26
+ hsh
27
+ end
19
28
 
29
+ def stringify_matches(matches=[])
30
+ string_match = matches&.uniq&.sort&.join(', ') if matches.any?
31
+ end
32
+
33
+ def fetch_criteria(oa_name)
34
+ criteria = @args.fetch(oa_name.to_sym, [])
35
+ criteria = criteria&.map(&:downcase)
36
+ end
37
+
38
+
39
+ def match_criteria(tars, include_or_equal, criteria)
20
40
  scrub_matches = tars.map do |tar|
21
- return hash unless criteria.present?
22
41
  if include_or_equal == 'include'
23
- criteria.select { |crit| crit if tar.include?(crit) }.join(', ')
42
+ criteria.map { |crit| crit if tar.include?(crit) }
24
43
  elsif include_or_equal == 'equal'
25
- criteria.select { |crit| crit if tar == crit }.join(', ')
44
+ criteria.map { |crit| crit if tar == crit }
26
45
  end
27
46
  end
47
+ scrub_matches = scrub_matches.flatten.compact
48
+ end
28
49
 
29
- scrub_match = scrub_matches&.uniq&.sort&.join(', ')
30
- return hash unless scrub_match.present?
31
-
32
- hash[oa_name.to_sym] << scrub_match
33
- hash
50
+ def prep_target(target)
51
+ target = target.join if target.is_a?(Array)
52
+ target = target.downcase
53
+ target = target.gsub(',', ' ')
54
+ target = target.gsub('-', ' ')
55
+ target = target.squeeze(' ')
56
+ end
34
57
 
35
- ### Delete below after testing above. ###
36
- # scrub_match = scrub_matches&.uniq&.sort&.join(', ')
37
- # return hash unless scrub_match.present?
38
- # if oa_name.include?('web_neg')
39
- # hash[:web_neg] << "#{oa_name}: #{scrub_match}"
40
- # else
41
- # hash[:web_pos] << "#{oa_name}: #{scrub_match}"
42
- # end
58
+ def target_to_tars(target)
59
+ tars = target.is_a?(::String) ? target.split(' ') : target
43
60
  end
61
+
62
+
44
63
  ######################################
45
64
 
46
65
 
47
66
  # def grab_global_hash
48
- # keys = %i[row_id act_name street city state zip full_addr phone url street_f city_f state_f zip_f full_addr_f phone_f url_f url_path web_neg address_status phone_status web_status utf_status]
67
+ # keys = %i[row_id act_name street city state zip full_addr phone url street_f city_f state_f zip_f full_addr_f phone_f url_f url_path ScrubWeb_neg address_status phone_status ScrubWeb_status utf_status]
49
68
  # @global_hash = Hash[keys.map { |a| [a, nil] }]
50
69
  # end
51
70
 
@@ -0,0 +1,52 @@
1
+
2
+
3
+ module ScrubDb
4
+ class Strings
5
+ # attr_accessor :headers, :valid_rows, :encoded_rows, :row_id, :data_hash, :defective_rows, :error_rows
6
+
7
+ def initialize(criteria={})
8
+ @empty_criteria = criteria&.empty?
9
+ @filter = ScrubDb::Filter.new(criteria) unless @empty_criteria
10
+ end
11
+
12
+ def scrub_proper_strings(props=[])
13
+ prop_hashes = CrmFormatter.format_propers(props)
14
+ prop_hashes = merge_criteria(prop_hashes)
15
+ prop_hashes.map! { |prop_hsh| scrub_hash(prop_hsh) }
16
+ end
17
+
18
+ def scrub_strings(strings=[])
19
+ str_hashes = strings_to_hashes(strings)
20
+ str_hashes = merge_criteria(str_hashes)
21
+ str_hashes.map! { |str_hsh| scrub_hash(str_hsh) }
22
+ end
23
+
24
+ def strings_to_hashes(strings)
25
+ str_hashes = strings.map { |str| { string: str } }
26
+ end
27
+
28
+ def merge_criteria(hashes)
29
+ hashes.map do |hsh|
30
+ hsh.merge({ pos_criteria: [], neg_criteria: [] })
31
+ end
32
+ end
33
+
34
+ def scrub_hash(hsh)
35
+ str = hsh[:string]
36
+ prop = hsh[:proper_f]
37
+
38
+ if str.present?
39
+ hsh = @filter.scrub_oa(hsh, str, 'neg_criteria', 'include')
40
+ hsh = @filter.scrub_oa(hsh, str, 'pos_criteria', 'include')
41
+ end
42
+
43
+ if prop.present?
44
+ hsh = @filter.scrub_oa(hsh, prop, 'neg_criteria', 'include')
45
+ hsh = @filter.scrub_oa(hsh, prop, 'pos_criteria', 'include')
46
+ end
47
+ hsh
48
+ end
49
+
50
+ end
51
+
52
+ end
@@ -1,3 +1,3 @@
1
1
  module ScrubDb
2
- VERSION = "0.0.1.pre.rc.03"
2
+ VERSION = "2.0"
3
3
  end
@@ -0,0 +1,70 @@
1
+
2
+
3
+ module ScrubDb
4
+ class Webs
5
+ # attr_accessor :headers, :valid_rows, :encoded_rows, :row_id, :data_hash, :defective_rows, :error_rows
6
+
7
+ def initialize(criteria={})
8
+ @empty_criteria = criteria&.empty?
9
+ @filter = ScrubDb::Filter.new(criteria) unless @empty_criteria
10
+ end
11
+
12
+ def scrub_urls(urls=[])
13
+ formatted_url_hashes = CrmFormatter.format_urls(urls)
14
+ formatted_url_hashes = merge_criteria_hashes(formatted_url_hashes)
15
+ formatted_url_hashes = pre_scrub(formatted_url_hashes)
16
+ end
17
+
18
+ def pre_scrub(hashes)
19
+ hashes = hashes.map do |hsh|
20
+ if hsh[:url_f].present?
21
+ hsh[:url_exts] = extract_exts(hsh)
22
+ hsh = scrub_url_hash(hsh)
23
+ end
24
+ hsh
25
+ end
26
+ end
27
+
28
+ def merge_criteria_hashes(hashes)
29
+ hashes.map! do |url_hash|
30
+ merge_criteria_hash(url_hash)
31
+ end
32
+ end
33
+
34
+ def merge_criteria_hash(url_hash)
35
+ url_hash.merge!(
36
+ {
37
+ url_exts: [],
38
+ neg_exts: [],
39
+ pos_exts: [],
40
+ neg_paths: [],
41
+ pos_paths: [],
42
+ neg_urls: [],
43
+ pos_urls: []
44
+ }
45
+ )
46
+ end
47
+
48
+ def extract_exts(url_hash)
49
+ uri_parts = URI(url_hash[:url_f]).host&.split('.')
50
+ url_exts = uri_parts[2..-1]
51
+ end
52
+
53
+ def scrub_url_hash(url_hash)
54
+ url = url_hash[:url_f]
55
+ path = url_hash[:url_path]
56
+ href = url_hash[:href]
57
+ url_exts = url_hash[:url_exts]
58
+
59
+ url_hash = @filter.scrub_oa(url_hash, url_exts, 'neg_exts', 'equal')
60
+ url_hash = @filter.scrub_oa(url_hash, url_exts, 'pos_exts', 'equal')
61
+ url_hash = @filter.scrub_oa(url_hash, url, 'neg_urls', 'include')
62
+ url_hash = @filter.scrub_oa(url_hash, url, 'pos_urls', 'include')
63
+ url_hash = @filter.scrub_oa(url_hash, path, 'neg_paths', 'include')
64
+ url_hash = @filter.scrub_oa(url_hash, path, 'pos_paths', 'include')
65
+ url_hash
66
+ end
67
+
68
+ end
69
+
70
+ end
data/lib/scrub_db.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "scrub_db/version"
2
- require 'scrub_db/web'
2
+ require 'scrub_db/webs'
3
+ require 'scrub_db/strings'
3
4
  require 'scrub_db/filter'
4
5
  require 'pry'
5
6
  require 'crm_formatter'
@@ -1,8 +1,8 @@
1
- # WebCriteria.new.all_web_criteria
1
+ # WebsCriteria.new.all_scrub_web_criteria
2
2
 
3
- class WebCriteria
3
+ class WebsCriteria
4
4
 
5
- def self.all_web_criteria
5
+ def self.all_scrub_web_criteria
6
6
  {
7
7
  neg_urls: seed_neg_urls,
8
8
  pos_urls: seed_pos_urls,
@@ -46,10 +46,10 @@ class WebCriteria
46
46
  # end
47
47
 
48
48
 
49
- # ##Rails C: StartCrm.run_webs
49
+ # ##Rails C: StartCrm.run_scrub_webs
50
50
  # def self.get_urls
51
51
  # urls = %w(approvedautosales.org autosmartfinance.com leessummitautorepair.net melodytoyota.com northeastacura.com gemmazda.com)
52
- # urls += %w(website.com website.business.site website website.fake website.fake.com website.com.fake)
52
+ # urls += %w(Scrubwebsite.com Scrubwebsite.business.site Scrubwebsite Scrubwebsite.fake Scrubwebsite.fake.com Scrubwebsite.com.fake)
53
53
  # end
54
54
 
55
55
  end
data/scrub_db.gemspec CHANGED
@@ -12,8 +12,8 @@ Gem::Specification.new do |spec|
12
12
  spec.homepage = 'https://github.com/4rlm/scrub_db'
13
13
  spec.license = "MIT"
14
14
 
15
- spec.summary = %q{Scrub data with your custom criteria. Returns detailed reporting.}
16
- spec.description = %q{Scrub data with your custom criteria. Returns detailed reporting. Rspecs coming soon.}
15
+ spec.summary = %q{Scrub your database, API data, web scraping data, and web form submissions based on your custom criteria. Allows for different criteria for different jobs. Returns detailed reporting to zero in on your data with ease, efficiency, and greater insight.}
16
+ spec.description = %q{Scrub your database, API data, web scraping data, and web form submissions based on your custom criteria. Allows for different criteria for different jobs. Returns detailed reporting to zero in on your data with ease, efficiency, and greater insight. Also lets you pre-format data before scrubbing to normalize and standardize your data sets, e.g., uniform URL patterns.}
17
17
 
18
18
  if spec.respond_to?(:metadata)
19
19
  spec.metadata['allowed_push_host'] = 'https://rubygems.org'
@@ -42,7 +42,7 @@ Gem::Specification.new do |spec|
42
42
  # spec.add_dependency "activesupport-inflector", ['~> 0.1.0']
43
43
 
44
44
  spec.add_dependency "utf8_sanitizer", "~> 2.0"
45
- spec.add_dependency "crm_formatter", "~> 2.4"
45
+ spec.add_dependency "crm_formatter", "~> 2.6"
46
46
 
47
47
  spec.add_development_dependency 'bundler', '~> 1.16', '>= 1.16.2'
48
48
  spec.add_development_dependency 'byebug', '~> 10.0', '>= 10.0.2'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrub_db
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1.pre.rc.03
4
+ version: '2.0'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Booth
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-06-27 00:00:00.000000000 Z
11
+ date: 2018-06-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -50,14 +50,14 @@ dependencies:
50
50
  requirements:
51
51
  - - "~>"
52
52
  - !ruby/object:Gem::Version
53
- version: '2.4'
53
+ version: '2.6'
54
54
  type: :runtime
55
55
  prerelease: false
56
56
  version_requirements: !ruby/object:Gem::Requirement
57
57
  requirements:
58
58
  - - "~>"
59
59
  - !ruby/object:Gem::Version
60
- version: '2.4'
60
+ version: '2.6'
61
61
  - !ruby/object:Gem::Dependency
62
62
  name: bundler
63
63
  requirement: !ruby/object:Gem::Requirement
@@ -208,8 +208,11 @@ dependencies:
208
208
  - - "~>"
209
209
  - !ruby/object:Gem::Version
210
210
  version: 0.97.4
211
- description: Scrub data with your custom criteria. Returns detailed reporting. Rspecs
212
- coming soon.
211
+ description: Scrub your database, api data, web scraping data, and web form submissions
212
+ based on your custom criteria. Allows for different criteria for different
213
+ jobs. Returns detailed reporting to zero in on your data with ease, efficiency,
214
+ and greater insight. Also lets you pre-format data before scrubbing to
215
+ normalize and standardize your data sets, e.g., uniform URL patterns
213
216
  email:
214
217
  - 4rlm@protonmail.ch
215
218
  executables: []
@@ -218,6 +221,7 @@ extra_rdoc_files: []
218
221
  files:
219
222
  - ".gitignore"
220
223
  - ".rspec"
224
+ - ".rspec_status"
221
225
  - ".travis.yml"
222
226
  - CODE_OF_CONDUCT.md
223
227
  - Gemfile
@@ -226,11 +230,13 @@ files:
226
230
  - Rakefile
227
231
  - bin/console
228
232
  - bin/setup
233
+ - junk.rb
229
234
  - lib/scrub_db.rb
230
235
  - lib/scrub_db/filter.rb
236
+ - lib/scrub_db/strings.rb
231
237
  - lib/scrub_db/version.rb
232
- - lib/scrub_db/web.rb
233
- - lib/web_criteria.rb
238
+ - lib/scrub_db/webs.rb
239
+ - lib/webs_criteria.rb
234
240
  - scrub_db.gemspec
235
241
  homepage: https://github.com/4rlm/scrub_db
236
242
  licenses:
@@ -248,13 +254,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
248
254
  version: 2.5.1
249
255
  required_rubygems_version: !ruby/object:Gem::Requirement
250
256
  requirements:
251
- - - ">"
257
+ - - ">="
252
258
  - !ruby/object:Gem::Version
253
- version: 1.3.1
259
+ version: '0'
254
260
  requirements: []
255
261
  rubyforge_project:
256
262
  rubygems_version: 2.7.6
257
263
  signing_key:
258
264
  specification_version: 4
259
- summary: Scrub data with your custom criteria. Returns detailed reporting.
265
+ summary: Scrub your database, api data, web scraping data, and web form submissions
266
+ based on your custom criteria. Allows for different criteria for different
267
+ jobs. Returns detailed reporting to zero in on your data with ease, efficiency,
268
+ and greater insight.
260
269
  test_files: []
data/lib/scrub_db/web.rb DELETED
@@ -1,108 +0,0 @@
1
-
2
-
3
- module ScrubDb
4
- class Web
5
- # attr_accessor :headers, :valid_rows, :encoded_rows, :row_id, :data_hash, :defective_rows, :error_rows
6
-
7
- def initialize(criteria={})
8
- @empty_criteria = criteria&.empty?
9
- @filter = ScrubDb::Filter.new(criteria) unless @empty_criteria
10
- end
11
-
12
- def scrub_urls(urls=[])
13
- formatted_url_hashes = CrmFormatter.format_urls(urls)
14
- formatted_url_hashes = merge_criteria_hashes(formatted_url_hashes)
15
-
16
- formatted_url_hashes.map! do |url_hash|
17
- if url_hash[:web_status] != 'invalid' && url_hash[:url_f].present?
18
- url_hash[:url_exts] = extract_exts(url_hash)
19
- url_hash = scrub_url_hash(url_hash)
20
- end
21
- end
22
- end
23
-
24
- def merge_criteria_hashes(hashes)
25
- hashes.map! do |url_hash|
26
- merge_criteria_hash(url_hash)
27
- end
28
- end
29
-
30
- def merge_criteria_hash(url_hash)
31
- url_hash.merge!(
32
- {
33
- url_exts: [],
34
- neg_exts: [],
35
- pos_exts: [],
36
- neg_paths: [],
37
- pos_paths: [],
38
- neg_urls: [],
39
- pos_urls: []
40
- }
41
- )
42
- end
43
-
44
- def extract_exts(url_hash)
45
- uri_parts = URI(url_hash[:url_f]).host&.split('.')
46
- url_exts = uri_parts[2..-1]
47
- end
48
-
49
- def scrub_url_hash(url_hash)
50
- url = url_hash[:url_f]
51
- path = url_hash[:url_path]
52
- href = url_hash[:href]
53
- url_exts = url_hash[:url_exts]
54
-
55
- url_hash = @filter.scrub_oa(url_hash, url_exts, 'neg_exts', 'equal')
56
- url_hash = @filter.scrub_oa(url_hash, url_exts, 'pos_exts', 'equal')
57
- url_hash = @filter.scrub_oa(url_hash, url, 'neg_urls', 'include')
58
- url_hash = @filter.scrub_oa(url_hash, url, 'pos_urls', 'include')
59
- url_hash = @filter.scrub_oa(url_hash, path, 'neg_paths', 'include')
60
- url_hash = @filter.scrub_oa(url_hash, path, 'pos_paths', 'include')
61
- url_hash
62
- end
63
-
64
- # def remove_invalid_links(link)
65
- # link_hsh = { link: link, valid_link: nil, flags: nil }
66
- # return link_hsh unless link.present?
67
- # @neg_paths += get_symbs
68
- # flags = @neg_paths.select { |red| link&.include?(red) }
69
- # flags << "below #{2}" if link.length < 2
70
- # flags << "over #{100}" if link.length > 100
71
- # flags = flags.flatten.compact
72
- # valid_link = flags.any? ? nil : link
73
- # link_hsh[:valid_link] = valid_link
74
- # link_hsh[:flags] = flags.join(', ')
75
- # binding.pry
76
- # link_hsh
77
- # end
78
-
79
- # def remove_invalid_hrefs(href)
80
- # href_hsh = { href: href, valid_href: nil, flags: nil }
81
- # return href_hsh unless href.present?
82
- # @neg_hrefs += get_symbs
83
- # href = href.split('|').join(' ')
84
- # href = href.split('/').join(' ')
85
- # href&.gsub!('(', ' ')
86
- # href&.gsub!(')', ' ')
87
- # href&.gsub!('[', ' ')
88
- # href&.gsub!(']', ' ')
89
- # href&.gsub!(',', ' ')
90
- # href&.gsub!("'", ' ')
91
- #
92
- # flags = []
93
- # flags << "over #{100}" if href.length > 100
94
- # invalid_text = Regexp.new(/[0-9]/)
95
- # flags << invalid_text&.match(href)
96
- # href = href&.downcase
97
- # href = href&.strip
98
- #
99
- # flags << @neg_hrefs.select { |red| href&.include?(red) }
100
- # flags = flags.flatten.compact.uniq
101
- # href_hsh[:valid_href] = href unless flags.any?
102
- # href_hsh[:flags] = flags.join(', ')
103
- # href_hsh
104
- # end
105
-
106
- end
107
-
108
- end