crm_formatter 1.0.7.pre.rc.1 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -2
- data/.rspec_status +7 -0
- data/.rubocop.yml +10 -0
- data/.rubocop_todo.yml +188 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +21 -0
- data/README.md +202 -145
- data/Rakefile +75 -5
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/crm_formatter.gemspec +59 -13
- data/lib/crm_formatter.rb +46 -3
- data/lib/crm_formatter/address.rb +143 -122
- data/lib/crm_formatter/{extensions.csv → csv/extensions.csv} +0 -0
- data/lib/crm_formatter/csv/seed.csv +2 -0
- data/lib/crm_formatter/phone.rb +28 -20
- data/lib/crm_formatter/version.rb +4 -2
- data/lib/crm_formatter/web.rb +115 -248
- data/lib/crm_formatter/wrap.rb +54 -0
- data/menu.rb +3 -1
- data/non_utf8_examples.txt +40 -0
- data/result.txt +964 -0
- data/rubocop.json +1 -0
- metadata +211 -16
- data/bin/crm_formatter +0 -4
- data/gem_notes_crm_formatter.txt +0 -138
- data/lib/crm_formatter/helpers.rb +0 -23
File without changes
|
data/lib/crm_formatter/phone.rb
CHANGED
@@ -1,38 +1,46 @@
|
|
1
|
-
|
2
|
-
class Phone
|
1
|
+
# frozen_string_literal: false
|
3
2
|
|
4
|
-
|
3
|
+
module CrmFormatter
|
4
|
+
class Phone
|
5
|
+
## Checks every phone number in table to verify that it meets phone criteria, then calls format_phone method to wrap Valid results. Otherwise destroys Invalid phone fields and associations.
|
5
6
|
|
6
|
-
# Call:
|
7
|
+
# Call: Wrap.new.validate_phone(phone)
|
7
8
|
def validate_phone(phone)
|
8
|
-
phone_hsh = {
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
end
|
9
|
+
phone_hsh = { phone_status: nil, phone: phone, phone_f: nil }
|
10
|
+
return phone_hsh unless phone.present?
|
11
|
+
phone = phone&.gsub(/\s/, ' ')&.strip
|
12
|
+
reg = Regexp.new('[(]?[0-9]{3}[ ]?[)-.]?[ ]?[0-9]{3}[ ]?[-. ][ ]?[0-9]{4}')
|
13
|
+
phone = nil if phone.first == '0' || phone.include?('(0') || !reg.match(phone)
|
14
|
+
phone_hsh[:phone_f] = format_phone(phone) if phone.present?
|
15
|
+
phone_hsh = check_phone_status(phone_hsh)
|
16
16
|
phone_hsh
|
17
17
|
end
|
18
18
|
|
19
|
+
####### COMPARE ORIGINAL AND FORMATTED PHONE ######
|
20
|
+
def check_phone_status(hsh)
|
21
|
+
phone = hsh[:phone]
|
22
|
+
phone_f = hsh[:phone_f]
|
23
|
+
status = 'invalid'
|
24
|
+
status = phone != phone_f ? 'formatted' : 'unchanged' if phone && phone_f
|
25
|
+
hsh[:phone_status] = status if status.present?
|
26
|
+
hsh
|
27
|
+
end
|
28
|
+
|
19
29
|
#################################
|
20
30
|
## FORMATS PHONE AS: (000) 000-0000
|
21
31
|
## Assumes phone is legitimate, then formats. Not designed to detect Valid phone number.
|
22
32
|
|
23
|
-
# Call:
|
33
|
+
# Call: Wrap.new.format_phone(phone)
|
24
34
|
def format_phone(phone)
|
25
|
-
regex = Regexp.new(
|
26
|
-
if !phone.blank? && (phone !=
|
27
|
-
phone_stripped = phone.gsub(/[^0-9]/,
|
28
|
-
|
29
|
-
|
35
|
+
regex = Regexp.new('[A-Z]+[a-z]+')
|
36
|
+
if !phone.blank? && (phone != 'N/A' || phone != '0') && !regex.match(phone)
|
37
|
+
phone_stripped = phone.gsub(/[^0-9]/, '')
|
38
|
+
phone_step2 = phone_stripped && phone_stripped[0] == '1' ? phone_stripped[1..-1] : phone_stripped
|
30
39
|
final_phone = !(phone_step2 && phone_step2.length < 10) ? "(#{phone_step2[0..2]}) #{(phone_step2[3..5])}-#{(phone_step2[6..9])}" : phone
|
31
40
|
else
|
32
41
|
final_phone = nil
|
33
42
|
end
|
34
|
-
|
43
|
+
final_phone
|
35
44
|
end
|
36
|
-
|
37
45
|
end
|
38
46
|
end
|
data/lib/crm_formatter/web.rb
CHANGED
@@ -1,64 +1,59 @@
|
|
1
|
+
# frozen_string_literal: false
|
2
|
+
|
3
|
+
# require 'rubygems'
|
4
|
+
# require 'active_support'
|
1
5
|
require 'csv'
|
2
6
|
|
3
|
-
|
7
|
+
# StartCrm.run_webs
|
8
|
+
module CrmFormatter
|
4
9
|
class Web
|
5
|
-
|
6
|
-
def initialize(args={})
|
7
|
-
@empty_oa = args.empty?
|
8
|
-
@pos_urls = args.fetch(:pos_urls, [])
|
9
|
-
@neg_urls = args.fetch(:neg_urls, [])
|
10
|
-
@pos_links = args.fetch(:pos_links, [])
|
11
|
-
@neg_links = args.fetch(:neg_links, [])
|
12
|
-
@pos_hrefs = args.fetch(:pos_hrefs, [])
|
13
|
-
@neg_hrefs = args.fetch(:neg_hrefs, [])
|
14
|
-
@pos_exts = args.fetch(:pos_exts, [])
|
15
|
-
@neg_exts = args.fetch(:neg_exts, [])
|
16
|
-
@min_length = args.fetch(:min_length, 2)
|
17
|
-
@max_length = args.fetch(:max_length, 100)
|
18
|
-
end
|
19
|
-
|
20
|
-
def banned_symbols
|
21
|
-
banned_symbols = ["!", "$", "%", "'", "(", ")", "*", "+", ",", "<", ">", "@", "[", "]", "^", "{", "}", "~"]
|
22
|
-
end
|
23
|
-
|
24
|
-
##Call: StartCrm.run_webs
|
25
10
|
def format_url(url)
|
26
11
|
prep_result = prep_for_uri(url)
|
27
12
|
url_hash = prep_result[:url_hash]
|
28
13
|
url = prep_result[:url]
|
29
|
-
url = nil if
|
30
|
-
|
31
|
-
if url
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
14
|
+
url = nil if errors?(url_hash)
|
15
|
+
|
16
|
+
if url&.present?
|
17
|
+
url = normalize_url(url)
|
18
|
+
ext_result = validate_extension(url_hash, url)
|
19
|
+
url_hash = ext_result[:url_hash]
|
20
|
+
url = ext_result[:url]
|
21
|
+
(url = nil if errors?(url_hash)) if url.present?
|
36
22
|
end
|
37
23
|
|
38
|
-
url_hash
|
39
|
-
url_hash =
|
24
|
+
url_hash = consolidate_negs(url_hash)
|
25
|
+
url_hash[:url_f] = url
|
26
|
+
url_hash = extract_path(url_hash) if url.present?
|
27
|
+
url_hash = check_web_status(url_hash)
|
40
28
|
url_hash
|
41
29
|
end
|
42
30
|
|
31
|
+
### COMPARE ORIGINAL AND FORMATTED URL ###
|
32
|
+
def check_web_status(hsh)
|
33
|
+
status = 'invalid' if hsh[:web_neg]&.include?('error')
|
43
34
|
|
44
|
-
|
45
|
-
|
46
|
-
if formatted.present?
|
47
|
-
url_hash[:is_reformatted] = url_hash[:url_path] != formatted
|
35
|
+
if hsh[:url] && hsh[:url_f] && status.nil?
|
36
|
+
status = hsh[:url] != hsh[:url_f] ? 'formatted' : 'unchanged'
|
48
37
|
end
|
49
|
-
|
38
|
+
|
39
|
+
hsh[:web_status] = status if status.present?
|
40
|
+
hsh
|
50
41
|
end
|
51
42
|
|
43
|
+
def consolidate_negs(hsh)
|
44
|
+
neg = hsh[:web_neg].join(', ')
|
45
|
+
hsh[:web_neg] = neg.present? ? neg : nil
|
46
|
+
hsh
|
47
|
+
end
|
52
48
|
|
53
|
-
def
|
54
|
-
errors = url_hash[:
|
49
|
+
def errors?(url_hash)
|
50
|
+
errors = url_hash[:web_neg].map { |web_neg| web_neg.include?('error') }
|
55
51
|
errors.any?
|
56
52
|
end
|
57
53
|
|
58
|
-
|
59
|
-
##Call: StartCrm.run_webs
|
60
54
|
def prep_for_uri(url)
|
61
|
-
url_hash = {
|
55
|
+
url_hash = { web_status: nil, url: url, url_f: nil, url_path: nil, web_neg: [] }
|
56
|
+
|
62
57
|
begin
|
63
58
|
url = url&.split('|')&.first
|
64
59
|
url = url&.split('\\')&.first
|
@@ -74,236 +69,108 @@ module CRMFormatter
|
|
74
69
|
url = url[0..-2] if url.present? && url[-1] == '/'
|
75
70
|
end
|
76
71
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
url_hash
|
81
|
-
url_hash =
|
82
|
-
else
|
83
|
-
url_hash[:neg] << "error: syntax"
|
84
|
-
url_hash[:formatted_url] = url
|
72
|
+
banned_symbols = ['!', '$', '%', "'", '(', ')', '*', '+', ',', '<', '>', '@', '[', ']', '^', '{', '}', '~']
|
73
|
+
url = nil if url.present? && banned_symbols.any? { |symb| url&.include?(symb) }
|
74
|
+
unless url.present?
|
75
|
+
url_hash[:web_neg] << 'error: syntax'
|
76
|
+
url_hash[:url_f] = url
|
85
77
|
end
|
86
|
-
|
87
|
-
|
88
|
-
url_hash[:neg] << "error: #{e}"
|
89
|
-
url = nil
|
90
|
-
url_hash
|
91
|
-
end
|
92
|
-
|
93
|
-
prep_result = { url_hash: url_hash, url: url }
|
94
|
-
end
|
95
|
-
|
96
|
-
|
97
|
-
##Call: StartCrm.run_webs
|
98
|
-
def run_uri(url_hash, url)
|
99
|
-
begin
|
100
|
-
uri = URI(url)
|
101
|
-
host_parts = uri.host&.split(".")
|
102
|
-
|
103
|
-
url_hash = compare_criteria(url_hash, host_parts, 'pos_exts', 'equal') if !@empty_oa
|
104
|
-
url_hash = compare_criteria(url_hash, host_parts, 'neg_exts', 'equal') if !@empty_oa
|
105
|
-
|
106
|
-
host = uri.host
|
107
|
-
scheme = uri.scheme
|
108
|
-
url = "#{scheme}://#{host}" if host.present? && scheme.present?
|
109
|
-
url = "http://#{url}" if url[0..3] != "http"
|
110
|
-
url = url.gsub("//", "//www.") if !url.include?("www.")
|
111
|
-
samp_url = convert_to_scheme_host(url)
|
112
|
-
|
113
|
-
url = convert_to_scheme_host(url) if url.present?
|
114
|
-
url_extens_result = check_url_extens(url_hash, url)
|
115
|
-
url_hash = url_extens_result[:url_hash]
|
116
|
-
url = url_extens_result[:url]
|
117
|
-
|
118
|
-
rescue Exception => e
|
119
|
-
url_hash[:neg] << "error: #{e}"
|
78
|
+
rescue StandardError => error
|
79
|
+
url_hash[:web_neg] << "error: #{error}"
|
120
80
|
url = nil
|
121
81
|
url_hash
|
122
82
|
end
|
123
|
-
|
124
|
-
|
125
|
-
end
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
if criteria_list.present?
|
161
|
-
if target.is_a?(::String)
|
162
|
-
tars = target.split(', ')
|
163
|
-
else
|
164
|
-
tars = target
|
165
|
-
end
|
166
|
-
|
167
|
-
pn_matches = tars.map do |tar|
|
168
|
-
if criteria_list.present?
|
169
|
-
if include_or_equal == 'include'
|
170
|
-
criteria_list.select { |el| el if tar.include?(el) }.join(', ')
|
171
|
-
elsif include_or_equal == 'equal'
|
172
|
-
criteria_list.select { |el| el if tar == el }.join(', ')
|
173
|
-
end
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
pn_match = pn_matches&.uniq&.sort&.join(', ')
|
178
|
-
if pn_match.present?
|
179
|
-
if list_name.include?('neg')
|
180
|
-
hash[:neg] << "#{list_name}: #{pn_match}"
|
181
|
-
else
|
182
|
-
hash[:pos] << "#{list_name}: #{pn_match}"
|
183
|
-
end
|
184
|
-
end
|
185
|
-
end
|
186
|
-
|
83
|
+
hsh = { url_hash: url_hash, url: url }
|
84
|
+
hsh
|
85
|
+
end
|
86
|
+
|
87
|
+
def normalize_url(url)
|
88
|
+
return unless url.present?
|
89
|
+
uri = URI(url)
|
90
|
+
scheme = uri&.scheme
|
91
|
+
host = uri&.host
|
92
|
+
url = "#{scheme}://#{host}" if host.present? && scheme.present?
|
93
|
+
url = "http://#{url}" if url[0..3] != 'http'
|
94
|
+
|
95
|
+
return unless url.present?
|
96
|
+
url.gsub!('//', '//www.') unless url.include?('www.')
|
97
|
+
url
|
98
|
+
end
|
99
|
+
|
100
|
+
# Source: http://www.iana.org/domains/root/db
|
101
|
+
# Text: http://data.iana.org/TLD/tlds-alpha-by-domain.txt
|
102
|
+
def validate_extension(url_hash, url)
|
103
|
+
return unless url.present?
|
104
|
+
uri_parts = URI(url).host&.split('.')
|
105
|
+
url_exts = uri_parts[2..-1]
|
106
|
+
|
107
|
+
### Finds Errors
|
108
|
+
if url_exts.empty? ## Missing ext.
|
109
|
+
err_msg = 'error: ext.none'
|
110
|
+
else ## Has ext(s), but need to verify validity and count.
|
111
|
+
file_path = './lib/crm_formatter/csv/extensions.csv'
|
112
|
+
iana_list = CSV.read(file_path).flatten
|
113
|
+
matched_exts = iana_list & url_exts
|
114
|
+
|
115
|
+
if matched_exts.empty? ## Has ext, but not valid.
|
116
|
+
err_msg = "error: ext.invalid [#{url_exts.join(', ')}]"
|
117
|
+
elsif matched_exts.count > 1 ## Has too many valid exts, Limit 1.
|
118
|
+
err_msg = "error: ext.valid > 1 [#{matched_exts.join(', ')}]"
|
187
119
|
end
|
188
120
|
end
|
189
|
-
|
190
|
-
hash
|
191
|
-
end
|
192
|
-
|
193
|
-
###### Supporting Methods Below #######
|
194
|
-
|
195
|
-
def extract_link(url_path)
|
196
|
-
url_hash = format_url(url_path)
|
197
|
-
url = url_hash[:formatted_url]
|
198
|
-
link = url_path
|
199
|
-
link_hsh = {url_path: url_path, url: url, link: nil }
|
200
|
-
if url.present? && link.present? && link.length > @min_length
|
201
|
-
url = strip_down_url(url)
|
202
|
-
link = strip_down_url(link)
|
203
|
-
link&.gsub!(url, '')
|
204
|
-
link = link&.split('.net')&.last
|
205
|
-
link = link&.split('.com')&.last
|
206
|
-
link = link&.split('.org')&.last
|
207
|
-
link = "/#{link.split("/").reject(&:empty?).join("/")}" if link.present?
|
208
|
-
link_hsh[:link] = link if link.present? && link.length > @min_length
|
209
|
-
end
|
210
|
-
link_hsh
|
211
|
-
end
|
212
121
|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
url = url.split('://')
|
219
|
-
url = url[-1]
|
220
|
-
return url
|
122
|
+
if err_msg
|
123
|
+
url_hash[:web_neg] << err_msg
|
124
|
+
url = nil
|
125
|
+
url_hash[:url_f] = nil
|
126
|
+
return { url_hash: url_hash, url: url }
|
221
127
|
end
|
222
|
-
end
|
223
|
-
|
224
128
|
|
225
|
-
|
226
|
-
|
227
|
-
if
|
228
|
-
|
229
|
-
|
230
|
-
flags << "below #{@min_length}" if link.length < @min_length
|
231
|
-
flags << "over #{@max_length}" if link.length > @max_length
|
232
|
-
flags = flags.flatten.compact
|
233
|
-
flags.any? ? valid_link = nil : valid_link = link
|
234
|
-
link_hsh[:valid_link] = valid_link
|
235
|
-
link_hsh[:flags] = flags.join(', ')
|
129
|
+
### Only Non-Errors Get Here ###
|
130
|
+
## Has one valid ext, but need to check if original url exts were > 1. Replace if so.
|
131
|
+
if url_exts.count > matched_exts.count
|
132
|
+
inv_ext = (url_exts - matched_exts).join
|
133
|
+
url = url.gsub(".#{inv_ext}", '')
|
236
134
|
end
|
237
|
-
link_hsh
|
238
|
-
end
|
239
|
-
|
240
|
-
|
241
|
-
def remove_invalid_hrefs(href)
|
242
|
-
href_hsh = {href: href, valid_href: nil, flags: nil }
|
243
|
-
if href.present?
|
244
|
-
@neg_hrefs += get_symbs
|
245
|
-
href = href.split('|').join(' ')
|
246
|
-
href = href.split('/').join(' ')
|
247
|
-
href&.gsub!("(", ' ')
|
248
|
-
href&.gsub!(")", ' ')
|
249
|
-
href&.gsub!("[", ' ')
|
250
|
-
href&.gsub!("]", ' ')
|
251
|
-
href&.gsub!(",", ' ')
|
252
|
-
href&.gsub!("'", ' ')
|
253
135
|
|
254
|
-
|
255
|
-
|
256
|
-
invalid_text = Regexp.new(/[0-9]/)
|
257
|
-
flags << invalid_text&.match(href)
|
258
|
-
href = href&.downcase
|
259
|
-
href = href&.strip
|
260
|
-
|
261
|
-
flags << @neg_hrefs.select { |red| href&.include?(red) }
|
262
|
-
flags = flags.flatten.compact.uniq
|
263
|
-
href_hsh[:valid_href] = href unless flags.any?
|
264
|
-
href_hsh[:flags] = flags.join(', ')
|
265
|
-
end
|
266
|
-
href_hsh
|
136
|
+
ext_result = { url_hash: url_hash, url: url }
|
137
|
+
ext_result
|
267
138
|
end
|
268
139
|
|
140
|
+
###### Supporting Methods Below #######
|
269
141
|
|
270
|
-
def
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
return url
|
142
|
+
def extract_path(url_hash)
|
143
|
+
path_parts = url_hash[:url_f].split('//').last.split('/')[1..-1]
|
144
|
+
path = "/#{path_parts.join('/')}"
|
145
|
+
if path&.length > 2
|
146
|
+
url_hash[:url_path] = path
|
147
|
+
url_hash[:url_f] = url_hash[:url_f].gsub(url_hash[:url_path], '')
|
277
148
|
end
|
149
|
+
url_hash
|
278
150
|
end
|
279
151
|
|
280
|
-
|
281
|
-
#CALL: Formatter.new.remove_ww3(url)
|
152
|
+
# CALL: Wrap.new.remove_ww3(url)
|
282
153
|
def remove_ww3(url)
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
154
|
+
return unless url.present?
|
155
|
+
url.split('.').map { |part| url.gsub!(part, 'www') if part.scan(/ww[0-9]/).any? }
|
156
|
+
url&.gsub!('www.www', 'www')
|
157
|
+
url
|
287
158
|
end
|
288
159
|
|
289
|
-
|
290
160
|
# For rare cases w/ urls with mistaken double slash twice.
|
291
161
|
def remove_slashes(url)
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
#
|
301
|
-
#
|
302
|
-
#
|
303
|
-
#
|
304
|
-
# extensions = CSV.read(file_path)
|
162
|
+
return url unless url.present? && url.include?('//')
|
163
|
+
parts = url.split('//')
|
164
|
+
return parts[0..1].join if parts.length > 2
|
165
|
+
url
|
166
|
+
end
|
167
|
+
|
168
|
+
# def strip_down_url(url)
|
169
|
+
# return unless url.present?
|
170
|
+
# url = url.downcase.strip
|
171
|
+
# url = url.gsub('www.', '')
|
172
|
+
# url = url.split('://')
|
173
|
+
# url[-1]
|
305
174
|
# end
|
306
|
-
|
307
|
-
|
308
175
|
end
|
309
176
|
end
|