utf8_sanitizer 0.0.2.pre.rc.01 → 0.0.2.pre.rc.02
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/lib/utf8_sanitizer/csv/extensions.csv +1 -0
- data/lib/utf8_sanitizer/csv/seeds_clean.csv +3074 -0
- data/lib/utf8_sanitizer/csv/seeds_dirty.csv +28407 -0
- data/lib/utf8_sanitizer/csv/seeds_mega.csv +16670 -0
- data/lib/utf8_sanitizer/csv/seeds_mini.csv +100 -0
- data/lib/utf8_sanitizer/csv/seeds_mini_10.csv +11 -0
- data/lib/utf8_sanitizer/csv/seeds_mini_2_bug.csv +3 -0
- data/lib/utf8_sanitizer/run.rb +31 -0
- data/lib/utf8_sanitizer/seed.rb +97 -0
- data/lib/utf8_sanitizer/utf.rb +170 -0
- data/lib/utf8_sanitizer/version.rb +1 -1
- data/lib/utf8_sanitizer.rb +6 -3
- data/utf8_sanitizer.gemspec +2 -2
- metadata +16 -7
@@ -0,0 +1,100 @@
|
|
1
|
+
url,act_name,street,city,state,zip,phone
|
2
|
+
stanleykaufman.net,Stanley Chevrolet Kaufman,825 E Fair St,Kaufman,TX,75142,(888) 457-4391
|
3
|
+
leepartyka.com,Lee Partyka Chevrolet Mazda Isuzu Truck,200 Skiff St,Hamden,CT,6518,(203) 288-7761
|
4
|
+
burienhonda.com,"Honda of Burien
|
5
|
+
cortlandchryslerdodgejeep.com,Cortland Chrysler Dodge Jeep RAM,3878 West Rd,Cortland,NY,13045,(877) 279-3113
|
6
|
+
imperialmotors.net,Imperial Motors,4839 Virginia Beach Blvd,Virginia Beach,VA,23462,(757) 490-3651
|
7
|
+
liatoyotaofnorthampton.com,"Lia Toyota of Northampton
|
8
|
+
nelsonhallchevrolet.com,Nelson Hall Chevrolet,1811 S Frontage Rd,Meridian,MS,39301,(601) 621-4593
|
9
|
+
marshallfordco.com,Marshall Ford Co Inc.,14843 MS-16,Philadelphia,MS,39350,(888) 461-7643
|
10
|
+
warrentontoyota.com,Warrenton Toyota,6449 Lee Hwy,Warrenton,VA,20187,(540) 878-4100
|
11
|
+
toyotacertifiedatcentralcity.com,"Toyota Certified
|
12
|
Central City",4800 Chestnut St,Philadelphia,PA,19139,(888) 379-1155
|
13
|
+
rockcitychrysler.com,Rock City Chrysler Jeep Dodge,520 Rock City St,Little Valley,NY,14755,(866) 414-1024
|
14
|
+
eddinsford.com,Eddins Ford Inc.,2895 S Seminole Trail,Madison,VA,22727,(888) 348-0661
|
15
|
+
lithiasubarufresno.com,Lithia Subaru of Fresno,5499 N Blackstone Ave,Fresno,CA,93710,(888) 453-5359
|
16
|
+
nnchevrolet.com,Northern Neck Chevrolet,18175 Kings Hwy,Montross,VA,22520,(804) 368-6547
|
17
|
+
brewsterhonda.com,"Lia Honda Brewster NY
|
18
|
+
tegelerchevrolet.com,Tegeler Chevrolet,17114 Fordtran Blvd,Industry,TX,78944,(979) 357-2575
|
19
|
+
rochesterhillschryslerjeep.net,Rochester Hills Chrysler Jeep Dodge,1301 S Rochester Rd,Rochester,MI,48307,(248) 841-4094
|
20
|
+
1stkia.com,"First Kia
|
21
|
+
audijax.com,Audi Jacksonville,7230 Blanding Blvd,Jacksonville,FL,32244,(888) 451-7572
|
22
|
+
kalsauto.com,Kal's Auto Sales II,5130 W 8 Mile Rd,Detroit,MI,48234,(313) 891-0000
|
23
|
+
carwashcarsinc.com,Car Wash Cars Inc,462 Route 9W,Glenmont,NY,12077,(518) 729-4317
|
24
|
+
autonationnissanorangepark.com,AutoNation Nissan Orange Park,7447 Blanding Blvd,Jacksonville,FL,32244,(904) 270-9954
|
25
|
+
cfschwartztoyota.com,C. F. Schwartz Toyota,1536 N Dupont Hwy,Dover,DE,19901,(877) 769-6001
|
26
|
+
lhmlexus.com,"Directions to Larry H. Miller Lexus Murray near Salt Lake City, UT
|
27
|
+
agchevy.com,Arroyo Grande Chevrolet,303 Traffic Way,Arroyo Grande,CA,93420,(805) 202-4636
|
28
|
+
billowenscars.com,Bill Owens Auto Sales,705 U.S. 27 SOUTH,Avon Park,FL,33825,(863) 452-5599
|
29
|
+
damascusmotors.com,Damascus Motors,26100 Woodfield Rd,Damascus,MD,20872,(301) 253-2151
|
30
|
+
dennisdillonchryslerjeepdodge.com,Dennis Dillon Chrysler Jeep Dodge,4025 Cleveland Blvd,Caldwell,ID,83605,(888) 506-8191
|
31
|
+
bobhowardhonda.com,Bob Howard Honda,13201 N Kelley Ave,Oklahoma City,OK,73131,(405) 753-8700
|
32
|
+
humboldtdodgechryslerjeep.com,Lonnie Cobb's Humboldt Chrysler Dodge Jeep RAM,3301 Eastend Dr,Humboldt,TN,38343,(888) 554-6047
|
33
|
+
livingstonvw.com,Livingston Volkswagen,21141 Ventura Blvd,Woodland Hills,CA,91364,(855) 419-9290
|
34
|
+
beckmastennorth.com,Beck and Masten North,11300 FM 1960,Houston,TX,77065,(281) 469-5222
|
35
|
+
fairbanksnissan.com,Fairbanks Nissan,2610 S Cushman St,Fairbanks,AK,99701,(866) 467-8690
|
36
|
+
feyerlincoln.net,Feyer Lincoln Inc.,1677 US-17 BUS,Williamston,NC,27892,(866) 493-1351
|
37
|
+
forrestersdodgecity.com,Forrester's Dodge City,204 S Walnut St,Arnold,NE,69120,(888) 463-0311
|
38
|
+
victorchevrolet.com,Victor Chevrolet - A Rochester Area Dealer,7200 Pittsford Victor Rd,Victor,NY,14564,(585) 433-2500
|
39
|
+
landroverpalmbeach.com,Land Rover Palm Beach,7550 Okeechobee Blvd,West Palm Beach,FL,33411,(561) 209-7000
|
40
|
+
goudyhonda.com,Goudy Honda,1400 W Main St,Alhambra,CA,91801,(626) 576-1114
|
41
|
+
davis-olaughlin.com,Davis O'Laughlin Buick GMC Cadillac,4580 US-219,Great Valley,NY,14741,(716) 945-5050
|
42
|
+
lutherhondaofstcloud.com,Luther St. Cloud Honda,1500 Hwy 23 West,Waite Park,MN,56387,(888) 453-5650
|
43
|
+
titanautosales.com,Titan Auto Sales LLC,1037 Central Ave,Albany,NY,12205,(518) 438-0101
|
44
|
+
hixsonbmw.com,Hixson BMW Monroe,1201 Louisville Ave,Monroe,LA,71201,(855) 519-8884
|
45
|
+
antwerpenautoworld.com,Antwerpen Auto World,9400 Liberty Road,Randallstown,MD,21133,(888) 437-9880
|
46
|
+
lakelandcdj.com,Lakeland Chrysler Dodge Jeep,2875 Mall Hill Drive,Lakeland,FL,33810,(888) 420-1648
|
47
|
+
blakegreenfieldchevroletbuick.com,Blake Greenfield Chevrolet Buick,150 3rd St NE,Wells,MN,56097,(507) 774-0365
|
48
|
+
carltonmb.com,Carlton Motorcars,2446 Laurens Rd,Greenville,SC,29607,(855) 336-9760
|
49
|
+
silkohonda.com,Silko Honda,1580 New State Hwy,Raynham,MA,2767,(508) 880-5500
|
50
|
+
lahabramotors.com,La Habra Motors,980 E Whittier Blvd,La Habra,CA,90631,(562) 697-2400
|
51
|
+
hugoautosales.com,Hugo Auto Sales,2583 Cameron Langston Rd,Grifton,NC,28530,(252) 522-4140
|
52
|
+
baxterfordsouth.com,Baxter Ford South,9203 S 145th St,Omaha,NE,68138,(855) 985-0071
|
53
|
+
roemotorsny.com,Roe Motors Ltd,662 Montauk Hwy,Shirley,NY,11967,(631) 395-2277
|
54
|
+
kingmontclair.com,King Auto Sales,5053 Mission Blvd,Montclair,CA,91763,(909) 627-1211
|
55
|
+
douglasinfiniti.com,Douglas Infiniti,430 Morris Ave,Summit,NJ,7901,(908) 522-7300
|
56
|
+
plazainfiniti.com,Plaza Infiniti,755 N New Ballas Rd,Creve Coeur,MO,63141,(888) 593-1881
|
57
|
+
coughlinmarysvillechrysler.com,Coughlin Chrysler Jeep Dodge RAM,15777 Watkins Rd,Marysville,OH,43040,(888) 378-6128
|
58
|
+
dillonlakemtrs.com,Dillon Lake Motors,3350 Maple Ave,Zanesville,OH,43701,(740) 297-7265
|
59
|
+
chapmanspeedway.com,Chapman Used Cars on Speedway,6001 E Speedway Blvd,Tucson,AZ,85712,(520) 829-0032
|
60
|
+
beckchevroletonline.com,"Beck Chevrolet Co., Inc.",561 Central Park Ave,Yonkers,NY,10704,(914) 595-1463
|
61
|
+
infinitimv.com,INFINITI of Mission Viejo,28471 Marguerite Pkwy,Mission Viejo,CA,92692,(855) 410-3445
|
62
|
+
hondaofserramonte.com,Honda of Serramonte,485 Serramonte Blvd,Colma,CA,94014,(888) 892-5461
|
63
|
+
douglassnissanofwaco.com,Douglass Nissan of Waco,5605 Legend Lake Pkwy,Waco,TX,76712,(877) 302-5270
|
64
|
+
saturnofknoxville.com,McManus Auto Sales,6404 Maynardville Pike,Knoxville,TN,37918,(865) 281-2278
|
65
|
+
avchevy.com,Antelope Valley Chevrolet,1160 Motor Ln,Lancaster,CA,93534,(661) 952-2300
|
66
|
+
skylineford.net,Skyline Ford,2510 Commercial St SE,Salem,OR,97302,(888) 873-7584
|
67
|
+
moseshuntingtongm.com,Moses Buick GMC Cadillac,5200 Route 60 East,Huntington,WV,25705,(304) 736-5291
|
68
|
+
hatchtoyota.com,2017 Hatch Toyota.,1051 Automall Pkwy,Show Low,AZ,85901,(928) 242-9007
|
69
|
+
malouffordinc.com,Malouf Ford,2210 US-1,North Brunswick Township,NJ,8902,(866) 951-8097
|
70
|
+
seminolefordok.com,Seminole Ford,2222 N Milt Phillips Ave,Seminole,OK,74868,(888) 903-5643
|
71
|
+
flagshipmotorcars.com,Flagship Motorcars of Lynnfield,385 Broadway,Lynnfield,MA,1940,(877) 907-1967
|
72
|
+
capitolhyundai.com,Capitol Hyundai Columbia,101 Newland Rd,Columbia,SC,29229,(800) 258-1436
|
73
|
+
searsimports.com,"Sears Imported Autos, Inc.",13500 Wayzata Blvd,Minnetonka,MN,55305,(888) 580-8791
|
74
|
+
videon.com,Videon Chrysler Dodge Jeep RAM,4951 West Chester Pike,Newtown Square,PA,19073,(610) 356-7000
|
75
|
+
towncountrychryslerdodge.com,Town & Country Chrysler Dodge Ram,703 Crim Ave,Belington,WV,26250,(877) 830-9359
|
76
|
+
palmharborhonda.com,Courtesy Palm Harbor Honda,31200 US Hwy 19 N,Palm Harbor,FL,34684,(866) 423-6155
|
77
|
+
bravolascruces.com,Bravo Chevrolet Cadillac,1601 S Main St,Las Cruces,NM,88005,(575) 527-3800
|
78
|
+
ricksautosite.com,Rick's Auto Sales Used Cars,1536 E 5th St,Metropolis,IL,62960,(618) 524-7366
|
79
|
+
audiofhuntington.com,Audi of Huntington,363 E Jericho Turnpike,Huntington Station,NY,11746,(866) 455-0268
|
80
|
+
discountautocorner.com,Discount Auto,2908 Lincoln Hwy,Langhorne,PA,19047,(215) 757-5170
|
81
|
+
audipetoskey.com,Audi Petoskey,825 Charlevoix Ave,Petoskey,MI,49770,(888) 478-1060
|
82
|
+
vwoforchardpark.com,Volkswagen of Orchard Park,3524 Southwestern Blvd,Orchard Park,NY,14127,(716) 216-0256
|
83
|
+
pietroske.com,Pietroske Chevrolet,4000 Grand Avenue,Manitowoc,WI,54221,(920) 684-0224
|
84
|
+
hendrickhyundainorth.com,Hendrick Hyundai North Charleston,8485 Rivers Ave,North Charleston,SC,29406,(866) 542-1320
|
85
|
+
johnsgreatcars.com,John's Great Cars,1133 Lancaster Ave,Reading,PA,19607,(610) 603-9100
|
86
|
+
beckmotors.net,Beck Motors,305 US Hwy 63,Freeburg,MO,65035,(888) 638-4505
|
87
|
+
parksmotorsales.com,Parks Motor Sales Inc.,919 Nashville Hwy,Columbia,TN,38401,(931) 388-5843
|
88
|
+
josephbuickgmc.com,Joseph Buick GMC,8700 Colerain Ave,Groesbeck,OH,45251,(513) 741-1000
|
89
|
+
centralgmcnorwood.com,Central Buick GMC of Norwood,70 Boston Providence Hwy,Norwood,MA,2062,(781) 473-0908
|
90
|
+
cruisinclassicsinc.com,Cruisin Classics Auto Sales,3575 Fisher Rd,Columbus,OH,43228,(614) 276-7355
|
91
|
+
everettchryslerjeepdodgeram.com,Everett Chrysler Dodge Jeep Ram,3709 S Thompson St,Springdale,AR,72764,(888) 257-0465
|
92
|
+
garavelsubaru.com,Garavel Subaru,10 Tindall Ave,Norwalk,CT,6851,(877) 223-0913
|
93
|
+
jansenchevrolet.com,Jansen Chevrolet,7801 IL-161,Germantown,IL,62245,(618) 613-4215
|
94
|
+
bramanbmwjupiter.com,Braman BMW of Jupiter,1555 W Indiantown Rd,Jupiter,FL,33458,(561) 609-0144
|
95
|
+
mildenbergermotors.com,Mildenberger Motors,1717 N 1st St,Hamilton,MT,59840,(406) 626-3050
|
96
|
+
autonationfordarlington.com,AutoNation Ford Arlington,1400 W Interstate 20,Arlington,TX,76017,(817) 200-4503
|
97
|
+
rampmotors.com,Ramp Motors,4869 Nesconset Hwy,Port Jefferson Station,NY,11776,(631) 473-1550
|
98
|
+
millerchevy.com,Miller & Sons Chevrolet Buick,3107 Green Garden Rd,Aliquippa,PA,15001,(724) 378-0541
|
99
|
+
cjsautosales.com,CJS Auto Sales,2509 S Houston Rd,Pasadena,TX,77502,(713) 947-6405
|
100
|
+
redwingchev.net,Red Wing Chevrolet Buick Cadillac,2500 US Hwy 61,Red Wing,MN,55066,(651) 388-4777
|
101
|
+
sunnyvalevw.com,Sunnyvale Volkswagen,1025 E El Camino Real,Sunnyvale,CA,94087,(877) 785-8252
|
@@ -0,0 +1,11 @@
|
|
1
|
+
url,act_name,street,city,state,zip,phone
|
2
|
+
stanleykaufman.net,Stanley Chevrolet Kaufman,825 E Fair St,Kaufman,TX,75142,(888) 457-4391
|
3
|
+
leepartyka.com,Lee Partyka Chevrolet Mazda Isuzu Truck,200 Skiff St,Hamden,CT,6518,(203) 288-7761
|
4
|
+
burienhonda.com,"Honda of Burien
|
5
|
+
cortlandchryslerdodgejeep.com,Cortland Chrysler Dodge Jeep RAM,3878 West Rd,Cortland,NY,13045,(877) 279-3113
|
6
|
+
imperialmotors.net,Imperial Motors,4839 Virginia Beach Blvd,Virginia Beach,VA,23462,(757) 490-3651
|
7
|
+
liatoyotaofnorthampton.com,"Lia Toyota of Northampton
|
8
|
+
nelsonhallchevrolet.com,Nelson Hall Chevrolet,1811 S Frontage Rd,Meridian,MS,39301,(601) 621-4593
|
9
|
+
marshallfordco.com,Marshall Ford Co Inc.,14843 MS-16,Philadelphia,MS,39350,(888) 461-7643
|
10
|
+
warrentontoyota.com,Warrenton Toyota,6449 Lee Hwy,Warrenton,VA,20187,(540) 878-4100
|
11
|
+
toyotacertifiedatcentralcity.com,"Toyota Certified
|
12
|
Central City",4800 Chestnut St,Philadelphia,PA,19139,(888) 379-1155
|
@@ -0,0 +1,31 @@
|
|
1
|
+
|
2
|
+
module Utf8Sanitizer
  # Entry point for an import run: collects the caller's options, falls back
  # to the bundled seed data when no input is supplied, and hands everything
  # to Utf8Sanitizer::UTF for validation.
  class Run
    def initialize
      @crm_data = {}
    end

    # Runs the sanitizer over caller-supplied input, or over seed data.
    #
    # args - Hash of options (default: {}). Keys read here:
    #        :data      - rows to validate (passed through to UTF#validate_data)
    #        :file_path - path of a CSV file to validate
    #        :criteria  - only defaulted from Seed when seed data is used
    #        Any other keys are merged into the result untouched.
    #
    # When neither :data nor :file_path has a non-nil value, the seed file
    # path is used and :pollute_seeds is set to true.
    #
    # Returns the accumulated options Hash, merged with whatever
    # UTF#validate_data returned (its keys win on conflict).
    def import(args = {})
      @crm_data = { stats: nil, data: nil, file_path: nil, criteria: nil }
      @crm_data.merge!(args)
      supplied = args.compact.keys # keys given with a nil value count as "not supplied"

      has_input = supplied.include?(:data) || supplied.include?(:file_path)
      apply_seed_defaults(supplied) unless has_input

      utf_result = Utf8Sanitizer::UTF.new.validate_data(@crm_data)
      # validate_data returns nil when it had nothing to process (for example a
      # blank file path); merging nil would raise TypeError, so skip the merge.
      @crm_data.merge!(utf_result) if utf_result
      @crm_data
    end

    private

    # Fills in the seed file path (and seed criteria, unless the caller gave
    # their own) and flags the run so the seed lines get polluted on purpose.
    def apply_seed_defaults(supplied)
      seed = Seed.new
      @crm_data[:file_path] = seed.grab_seed_file_path
      # @crm_data[:data] = seed.grab_seed_hashes
      @crm_data[:pollute_seeds] = true
      @crm_data[:criteria] = seed.grab_seed_web_criteria unless supplied.include?(:criteria)
    end
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module Utf8Sanitizer
  # Supplies sample input for exercising the sanitizer: a seed CSV path,
  # sample row hashes, web criteria lists, and a helper that deliberately
  # injects invalid/non-ASCII characters into a line.
  class Seed
    # Fragments injected by #pollute_seeds: two valid non-ASCII strings plus
    # single bytes that are not valid UTF-8 on their own.
    POLLUTANTS = ['h∑', 'lÔ', "\x92", "\x98", "\x99", "\xC0", "\xC1", "\xC2", "\xCC", "\xDD", "\xE5", "\xF8"].freeze

    # args is accepted for forward compatibility but is currently unused.
    def initialize(args = {})
      # @pollute_seeds = args.fetch(:pollute_seeds, false)
      # @seed_hashes = args.fetch(:seed_hashes, false)
      # @seed_csv = args.fetch(:seed_csv, false)
    end

    # Returns a polluted copy of +text+: two random POLLUTANTS joined by "_"
    # are inserted at the midpoint and "\r\n" is appended.
    #
    # The argument itself is left untouched (so frozen strings are accepted);
    # callers must use the return value.
    def pollute_seeds(text)
      polluted = text.dup
      junk = "#{POLLUTANTS.sample}_#{POLLUTANTS.sample}"
      polluted.insert(polluted.length / 2, junk)
      polluted << "\r\n"
      polluted
    end

    # Returns the absolute path of the default seed CSV.
    #
    # The path is resolved relative to this source file (the csv/ directory
    # ships next to seed.rb in the gem), so it works from any working
    # directory rather than only from the gem root.
    def grab_seed_file_path
      # Other bundled seed files: seeds_clean.csv, seeds_dirty.csv,
      # seeds_mega.csv, seeds_mini.csv, seeds_mini_10.csv
      File.expand_path('csv/seeds_mini_2_bug.csv', __dir__)
    end

    ### Sample Hashes for validate_data
    # Returns five sample rows keyed by Symbol; :row_id numbers them 1..5.
    def grab_seed_hashes
      [{ row_id: 1,
         url: 'stanleykaufman.com',
         act_name: 'Stanley Chevrolet Kaufman',
         street: '825 E Fair St',
         city: 'Kaufman',
         state: 'TX',
         zip: '75142',
         phone: '(888) 457-4391' },
       { row_id: 2,
         url: 'leepartyka',
         act_name: 'Lee Partyka Chevrolet Mazda Isuzu Truck',
         street: '200 Skiff St',
         city: 'Hamden',
         state: 'CT',
         zip: '6518',
         phone: '(203) 288-7761' },
       { row_id: 3,
         url: 'burienhonda.fake.not.net.com',
         act_name: 'Honda of Burien 15026 1st Avenue South, Burien, WA 98148',
         street: '15026 1st Avenue South',
         city: 'Burien',
         state: 'WA',
         zip: '98148',
         phone: '(206) 246-9700' },
       { row_id: 4,
         url: 'cortlandchryslerdodgejeep.com',
         act_name: 'Cortland Chrysler Dodge Jeep RAM',
         street: '3878 West Rd',
         city: 'Cortland',
         state: 'NY',
         zip: '13045',
         phone: '(877) 279-3113' },
       { row_id: 5,
         url: 'imperialmotors.net',
         act_name: 'Imperial Motors',
         street: '4839 Virginia Beach Blvd',
         city: 'Virginia Beach',
         state: 'VA',
         zip: '23462',
         phone: '(757) 490-3651' }]
    end

    # Returns a Hash of criteria lists with keys :pos_urls, :neg_urls,
    # :neg_links, :neg_hrefs and :neg_exts. :neg_links and :neg_hrefs are
    # currently empty because their word lists are commented out below.
    def grab_seed_web_criteria
      neg_links = []
      neg_hrefs = []

      neg_urls = %w[approv avis budget collis eat enterprise facebook financ food google gourmet hertz hotel hyatt insur invest loan lube mobility motel motorola parts quick rent repair restaur rv ryder service softwar travel twitter webhost yellowpages yelp youtube]

      pos_urls = ['acura', 'alfa romeo', 'aston martin', 'audi', 'bmw', 'bentley', 'bugatti', 'buick', 'cdjr', 'cadillac', 'chevrolet', 'chrysler', 'dodge', 'ferrari', 'fiat', 'ford', 'gmc', 'group', 'group', 'honda', 'hummer', 'hyundai', 'infiniti', 'isuzu', 'jaguar', 'jeep', 'kia', 'lamborghini', 'lexus', 'lincoln', 'lotus', 'mini', 'maserati', 'mazda', 'mclaren', 'mercedes-benz', 'mitsubishi', 'nissan', 'porsche', 'ram', 'rolls-royce', 'saab', 'scion', 'smart', 'subaru', 'suzuki', 'toyota', 'volkswagen', 'volvo']

      # neg_links = %w(: .biz .co .edu .gov .jpg .net // afri anounc book business buy bye call cash cheap click collis cont distrib download drop event face feature feed financ find fleet form gas generat graphic hello home hospi hour hours http info insta inventory item join login mail mailto mobile movie museu music news none offer part phone policy priva pump rate regist review schedul school service shop site test ticket tire tv twitter watch www yelp youth)

      # neg_hrefs = %w(? .com .jpg @ * afri after anounc apply approved blog book business buy call care career cash charit cheap check click collis commerc cont contrib deal distrib download employ event face feature feed financ find fleet form gas generat golf here holiday hospi hour info insta inventory join later light login mail mobile movie museu music news none now oil part pay phone policy priva pump quick quote rate regist review saving schedul service shop sign site speci ticket tire today transla travel truck tv twitter watch youth)

      neg_exts = %w[au ca edu es gov in ru uk us]

      oa_args = { pos_urls: pos_urls, neg_urls: neg_urls, neg_links: neg_links, neg_hrefs: neg_hrefs, neg_exts: neg_exts }
      oa_args.compact
    end
  end
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# frozen_string_literal: false
|
2
|
+
require 'csv'
|
3
|
+
|
4
|
+
module Utf8Sanitizer
  # Sanitizes text one line at a time: drops bytes that are not valid in the
  # string's encoding, drops every non-ASCII character, collapses whitespace,
  # then parses the cleaned line as CSV and collects the rows plus statistics.
  #
  # Input is either a CSV file (#validate_csv) or an Array of Hashes
  # (#validate_hashes); #validate_data dispatches between the two.
  #
  # Only core Ruby and the csv library are used; no ActiveSupport helpers
  # are required.
  class UTF
    # Argument for String#delete: removes every character outside U+0000..U+007F.
    NON_ASCII = "^\u{0000}-\u{007F}".freeze

    # args is accepted but unused.
    def initialize(args = {})
      @seed = nil
      reset_state
    end

    #################### * VALIDATE DATA * ####################
    # Dispatches to the CSV and/or hash validators.
    #
    # args - Hash; only :file_path, :data and :pollute_seeds are read, and
    #        nil values are ignored. A truthy :pollute_seeds makes every
    #        non-header line pass through Seed#pollute_seeds first.
    #
    # Returns the result Hash ({ stats: ..., data: ... }) of the last
    # validator that ran (:data wins when both inputs are given), or nil when
    # neither input was supplied.
    def validate_data(args = {})
      args = args.slice(:file_path, :data, :pollute_seeds).compact

      # Hash#[] instead of #fetch: the key is legitimately absent whenever the
      # caller brings their own data, and #fetch would raise KeyError.
      @seed = args[:pollute_seeds] ? Seed.new : nil

      utf_result = nil
      utf_result = validate_csv(args[:file_path]) if args[:file_path]
      utf_result = validate_hashes(args[:data]) if args[:data]
      utf_result
    end

    #################### * COMPILE RESULTS * ####################
    # Builds the result Hash from the rows collected so far, then resets the
    # collectors so the instance can be reused.
    #
    # perfect_rows / wchar_rows are counted from the :utf_status of valid rows
    # only; the other counters are the sizes of their collections.
    def compile_results
      statuses = @valid_rows.map { |hsh| hsh[:utf_status] }
      details = statuses.compact.flat_map { |status| status.to_s.split(', ') }
      groups = make_groups_from_array(details)

      stats = {
        total_rows: @row_id,
        header_row: @headers.any? ? 1 : 0,
        valid_rows: @valid_rows.count,
        error_rows: @error_rows.count,
        defective_rows: @defective_rows.count,
        perfect_rows: groups['perfect'],
        encoded_rows: @encoded_rows.count,
        wchar_rows: groups['wchar']
      }
      data = {
        valid_data: @valid_rows,
        encoded_data: @encoded_rows,
        defective_data: @defective_rows,
        error_data: @error_rows
      }

      utf_result = { stats: stats, data: data }
      reset_state
      utf_result
    end

    #################### * VALIDATE CSV * ####################
    # Sanitizes and parses a CSV file line by line. The first parsed line
    # becomes the headers; every later line becomes a row Hash.
    #
    # Each physical line is parsed on its own, so a quoted field that spans
    # several lines is reported as an error row rather than reassembled.
    #
    # Row numbering: a line is filtered before @row_id is incremented, so the
    # header is recorded as row 0 and data rows count from 1, while a parse
    # failure is recorded with the already-incremented id.
    #
    # Returns the compiled result Hash, or nil when file_path is blank.
    # Raises whatever File.foreach raises if the file cannot be opened.
    def validate_csv(file_path)
      return if blank?(file_path)

      # File.foreach closes the file when iteration ends.
      File.foreach(file_path) do |file_line|
        guard_row do
          validated_line = utf_filter(check_utf(file_line))
          @row_id += 1
          CSV.parse(validated_line) { |row| store_row(row) } if validated_line
        end
      end
      compile_results
    end

    #################### * VALIDATE HASHES * ####################
    # Sanitizes an Array of Hashes. The keys of the first Hash become the
    # headers, then every Hash (including the first) is processed as a row.
    # A failure in one row is recorded in the error rows and the remaining
    # rows are still processed.
    #
    # Returns the compiled result Hash, or nil when orig_hashes is blank.
    def validate_hashes(orig_hashes)
      return if blank?(orig_hashes)

      guard_row { process_hash_row(orig_hashes.first) } ## re keys for headers.
      orig_hashes.each { |hsh| guard_row { process_hash_row(hsh) } } ## re values
      compile_results ## handles returns.
    end

    ### process_hash_row - helper VALIDATE HASHES ###
    ### Converts hash keys and vals into parsed line.
    # Until headers exist the keys are used; afterwards the values are.
    # Cells are CSV-quoted when needed so embedded commas or quotes survive
    # the round trip. When the Hash has no :row_id the previous id is
    # incremented instead.
    def process_hash_row(hsh)
      if @headers.any?
        cells = hsh.values
        @row_id = hsh[:row_id] || @row_id.to_i + 1
      else
        cells = hsh.keys.map(&:to_s)
      end

      line_parse(utf_filter(check_utf(csv_line(cells))))
    end

    ### line_parse - helper VALIDATE HASHES ###
    ### Parses line to row, then updates final results.
    # Uses the CSV parser so quoted cells stay intact; as in the file path,
    # an empty cell is returned as nil.
    def line_parse(validated_line)
      return unless validated_line

      row = CSV.parse_line(validated_line)
      return if row.nil? || row.empty?

      store_row(row)
    end

    #################### * CHECK UTF * ####################
    # Sanitizes one line of text.
    #
    # Returns nil for blank input, otherwise a Hash:
    #   :text    - the input (after optional seed pollution)
    #   :encoded - text without invalid bytes and non-ASCII characters,
    #              set only when that differs from :text
    #   :wchar   - :encoded with whitespace runs collapsed to one space and
    #              stripped, set only when that differs from :encoded
    #   :error   - message of any StandardError raised while sanitizing
    def check_utf(text)
      return if blank?(text)

      polluted = !@seed.nil? && @headers.any?
      text = @seed.pollute_seeds(text) if polluted
      results = { text: text, encoded: nil, wchar: nil, error: nil }

      begin
        if text.valid_encoding?
          encoded = text.delete(NON_ASCII)
        else
          encoded = text.chars.select(&:valid_encoding?).join
          # "_" is the separator Seed#pollute_seeds inserts; strip it only for
          # polluted lines so real underscores in caller data are kept.
          encoded.delete!('_') if polluted
          encoded = encoded.delete(NON_ASCII)
        end

        wchar = encoded.gsub(/\s+/, ' ').strip
        results[:encoded] = encoded if text != encoded
        results[:wchar] = wchar if encoded != wchar
      rescue StandardError => error
        results[:error] = error.message
      end
      results
    end

    #################### * UTF FILTER * ####################
    # Records the outcome of #check_utf for the current @row_id and returns
    # the cleanest available line.
    #
    # utf - Hash produced by #check_utf.
    #
    # Side effects: appends to the encoded / error / defective collections
    # and starts a fresh @data_hash ({ row_id:, utf_status: }) for the row.
    # utf_status is 'perfect' or a ', '-joined list of the non-nil result
    # keys (encoded, wchar, error).
    #
    # Returns the last non-nil of :text, :encoded, :wchar, or nil when the
    # input is blank or carries an error.
    def utf_filter(utf)
      return if blank?(utf)

      flags = utf.reject { |key, value| key == :text || value.nil? }.keys
      utf_status = flags.empty? ? 'perfect' : flags.map(&:to_s).join(', ')

      error = utf[:error]
      line = utf.reject { |key, value| key == :error || value.nil? }.values.last unless error

      @encoded_rows << { row_id: @row_id, text: utf[:text] } if utf[:encoded] && utf[:text]
      if error
        @error_rows << { row_id: @row_id, text: error }
        @defective_rows << utf[:text]
      end

      # Always start a new hash: reusing the previous one would make several
      # entries of @valid_rows point at the same object.
      @data_hash = { row_id: @row_id, utf_status: utf_status }
      line
    end

    ############# !! HELPERS BELOW !! #############
    ############# KEY VALUE CONVERTERS #############
    # Pairs the headers with the row's cells; keys that can be symbolized are.
    def row_to_hsh(row)
      @headers.zip(row).to_h.transform_keys do |key|
        key.respond_to?(:to_sym) ? key.to_sym : key
      end
    end

    # Removes, in place, every key of hsh that is not listed in cols.
    # Returns hsh.
    def val_hsh(cols, hsh)
      hsh.keep_if { |key, _value| cols.include?(key) }
    end

    # Counts occurrences; the returned Hash yields 0 for unknown keys.
    def make_groups_from_array(array)
      array.each_with_object(Hash.new(0)) { |e, h| h[e] += 1 }
    end

    private

    # Clears the per-run collectors. @seed is deliberately left alone so a
    # single validate_data call can run both validators with the same setting.
    def reset_state
      @utf_result = { stats: {}, data: {} }
      @valid_rows = []
      @encoded_rows = []
      @defective_rows = []
      @error_rows = []
      @headers = []
      @row_id = 0
      @data_hash = {}
    end

    # Runs the block and records any StandardError as an error row for the
    # current @row_id instead of aborting the whole run.
    def guard_row
      yield
    rescue StandardError => error
      @error_rows << { row_id: @row_id, text: error.message }
    end

    # The first parsed row becomes the headers; later rows are merged into
    # the current @data_hash and collected as valid rows.
    def store_row(row)
      if @headers.empty?
        @headers = row
      else
        @data_hash.merge!(row_to_hsh(row))
        @valid_rows << @data_hash
      end
    end

    # Joins cells into one CSV line, quoting cells that contain a comma or a
    # double quote. Newlines need no quoting here because #check_utf collapses
    # all whitespace before the line is parsed.
    def csv_line(cells)
      cells.map { |cell| quote_cell(cell.to_s) }.join(',')
    end

    def quote_cell(str)
      return str unless str.include?(',') || str.include?('"')

      %("#{str.gsub('"', '""')}")
    end

    # True for nil, whitespace-only strings (invalid bytes are ignored for
    # the check), empty collections, and false.
    def blank?(value)
      return true if value.nil?
      return value.scrub('').match?(/\A[[:space:]]*\z/) if value.is_a?(String)

      value.respond_to?(:empty?) ? value.empty? : !value
    end
  end
end
|
data/lib/utf8_sanitizer.rb
CHANGED
@@ -1,11 +1,14 @@
|
|
1
1
|
require "utf8_sanitizer/version"
|
2
|
+
require 'utf8_sanitizer/run'
|
3
|
+
require 'utf8_sanitizer/seed'
|
4
|
+
require 'utf8_sanitizer/utf'
|
5
|
+
require 'pry'
|
2
6
|
|
3
7
|
module Utf8Sanitizer
  # Convenience entry point: builds a Run and calls #import with no
  # arguments, returning whatever Run#import returns.
  def self.run_wrap
    self::Run.new.import
  end
end
|
data/utf8_sanitizer.gemspec
CHANGED
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
|
|
14
14
|
spec.homepage = 'https://github.com/4rlm/utf8_sanitizer'
|
15
15
|
spec.license = 'MIT'
|
16
16
|
|
17
|
-
spec.summary = "Still in
|
18
|
-
spec.description = "Removes invalid UTF8 characters, and extra whitespace (carriage returns, new lines, tabs, spaces, etc.) from csv, or strings.\n Example: ABC Au\\xC1tos,123 E Main St,Anytown,TX,75142,(888) 555-1234\\n\\r\\n
|
17
|
+
spec.summary = "Still in BETA: Removes invalid UTF8 characters, and extra whitespace (carriage returns, new lines, tabs, spaces, etc.) from csv, or strings."
|
18
|
+
spec.description = "Still in BETA: Removes invalid UTF8 characters, and extra whitespace (carriage returns, new lines, tabs, spaces, etc.) from csv, or strings.\n Example: ABC Au\\xC1tos,123 E Main St,Anytown,TX,75142,(888) 555-1234\\n\\r\\n => ABC Autos,123 E Main St,Anytown,TX,75142,(888) 555-1234"
|
19
19
|
|
20
20
|
if spec.respond_to?(:metadata)
|
21
21
|
spec.metadata['allowed_push_host'] = "https://rubygems.org"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8_sanitizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2.pre.rc.
|
4
|
+
version: 0.0.2.pre.rc.02
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Booth
|
@@ -180,10 +180,9 @@ dependencies:
|
|
180
180
|
- - "~>"
|
181
181
|
- !ruby/object:Gem::Version
|
182
182
|
version: 0.11.3
|
183
|
-
description:
|
184
|
-
new lines, tabs, spaces, etc.) from csv, or strings
|
185
|
-
|
186
|
-
555-1234"
|
183
|
+
description: |-
|
184
|
+
Still in BETA: Removes invalid UTF8 characters, and extra whitespace (carriage returns, new lines, tabs, spaces, etc.) from csv, or strings.
|
185
|
+
Example: ABC Au\xC1tos,123 E Main St,Anytown,TX,75142,(888) 555-1234\n\r\n => ABC Autos,123 E Main St,Anytown,TX,75142,(888) 555-1234
|
187
186
|
email:
|
188
187
|
- 4rlm@protonmail.ch
|
189
188
|
executables: []
|
@@ -201,6 +200,16 @@ files:
|
|
201
200
|
- bin/console
|
202
201
|
- bin/setup
|
203
202
|
- lib/utf8_sanitizer.rb
|
203
|
+
- lib/utf8_sanitizer/csv/extensions.csv
|
204
|
+
- lib/utf8_sanitizer/csv/seeds_clean.csv
|
205
|
+
- lib/utf8_sanitizer/csv/seeds_dirty.csv
|
206
|
+
- lib/utf8_sanitizer/csv/seeds_mega.csv
|
207
|
+
- lib/utf8_sanitizer/csv/seeds_mini.csv
|
208
|
+
- lib/utf8_sanitizer/csv/seeds_mini_10.csv
|
209
|
+
- lib/utf8_sanitizer/csv/seeds_mini_2_bug.csv
|
210
|
+
- lib/utf8_sanitizer/run.rb
|
211
|
+
- lib/utf8_sanitizer/seed.rb
|
212
|
+
- lib/utf8_sanitizer/utf.rb
|
204
213
|
- lib/utf8_sanitizer/version.rb
|
205
214
|
- utf8_sanitizer.gemspec
|
206
215
|
- utf8_sanitizer_gemspec_orig.txt
|
@@ -228,6 +237,6 @@ rubyforge_project:
|
|
228
237
|
rubygems_version: 2.7.6
|
229
238
|
signing_key:
|
230
239
|
specification_version: 4
|
231
|
-
summary: 'Still in
|
232
|
-
|
240
|
+
summary: 'Still in BETA: Removes invalid UTF8 characters, and extra whitespace (carriage
|
241
|
+
returns, new lines, tabs, spaces, etc.) from csv, or strings.'
|
233
242
|
test_files: []
|