crm_formatter 1.0.7.pre.rc.1 → 2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c1a9e605ef7ec90b0c88e80a32cd22fa604a9cc810fe559eb19c389c72c0e494
4
- data.tar.gz: 07b4436cf1d31125e44a0bd71424b7c7c21a50954faa8e14b2792f08fd7391d9
3
+ metadata.gz: 96ed8a01bb47d8aac9c3bb7b95a6e8d261ecc107d6d47c8813088157978239c4
4
+ data.tar.gz: 88125eff101ca1ab5e5fcec3015d8cdb3f5f45571e644cfd54547ab8259f268c
5
5
  SHA512:
6
- metadata.gz: 26357b48e1933ed4a421c9fc8ce1bd01a097d553e7d1810910bb7b891496f341ce456a45cff6313189adf1321e7fe95aad25bf9d4bf6471301dfa61bf4c637c3
7
- data.tar.gz: bb86e8cdcc5ea5526e16d7963687e4dd058e3097bdc2b88f6b65284f5cd282ab7f6890304abf90477e1f7503c0a0023b89f48b41c994467bc0f52feffc8509b5
6
+ metadata.gz: 27ba92aa172d3de3813b6338ac77281f840ce7d158a29b54ffa329edf6eac4f2c6394f3608522ec78bb7fe8efaeb9fcd2f034b47e6676c2f275a61e3c39aa2b9
7
+ data.tar.gz: 7b497e726c925b6cf402c3efb20c0baae87adad9bfd5e18d4a024915bc2127533356aa331804282e9325e4b0055db55eb911aa5cb14ec85eb68870d2a78631e5
data/.gitignore CHANGED
@@ -10,6 +10,5 @@
10
10
  crm_formatter-*.gem
11
11
  .DS_Store
12
12
  .idea/
13
+ .xlsx
13
14
  .txt
14
- .csv
15
- !extensions.csv
data/.rspec_status ADDED
@@ -0,0 +1,7 @@
1
+ example_id | status | run_time |
2
+ --------------------------------- | ------- | --------------- |
3
+ ./spec/crm_formatter_spec.rb[1:1] | passed | 0.00103 seconds |
4
+ ./spec/crm_formatter_spec.rb[1:2] | failed | 0.0207 seconds |
5
+ ./spec/crm_formatter_spec.rb[1:3] | passed | 0.00086 seconds |
6
+ ./spec/crm_formatter_spec.rb[1:4] | passed | 0.00009 seconds |
7
+ ./spec/crm_formatter_spec.rb[1:5] | unknown | |
data/.rubocop.yml ADDED
@@ -0,0 +1,10 @@
1
+ inherit_from: .rubocop_todo.yml
2
+
3
+ Metrics/LineLength:
4
+ Enabled: false
5
+
6
+ AllCops:
7
+ DisabledByDefault: false
8
+
9
+ AllCops:
10
+ TargetRubyVersion: 2.5.1
data/.rubocop_todo.yml ADDED
@@ -0,0 +1,188 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2018-05-22 12:28:19 -0500 using RuboCop version 0.56.0.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 1
10
+ # Configuration parameters: Include.
11
+ # Include: **/*.gemspec
12
+ Gemspec/RequiredRubyVersion:
13
+ Exclude:
14
+ - 'crm_formatter.gemspec'
15
+
16
+ # Offense count: 2
17
+ # Cop supports --auto-correct.
18
+ Layout/CommentIndentation:
19
+ Exclude:
20
+ - 'Rakefile'
21
+
22
+ # Offense count: 3
23
+ # Cop supports --auto-correct.
24
+ # Configuration parameters: AllowAdjacentOneLineDefs, NumberOfEmptyLines.
25
+ Layout/EmptyLineBetweenDefs:
26
+ Exclude:
27
+ - 'lib/crm_formatter/wrap.rb'
28
+
29
+ # Offense count: 25
30
+ # Cop supports --auto-correct.
31
+ Layout/EmptyLines:
32
+ Exclude:
33
+ - 'Rakefile'
34
+ - 'lib/crm_formatter.rb'
35
+ - 'lib/crm_formatter/wrap.rb'
36
+
37
+ # Offense count: 1
38
+ # Cop supports --auto-correct.
39
+ # Configuration parameters: EnforcedStyle.
40
+ # SupportedStyles: empty_lines, no_empty_lines
41
+ Layout/EmptyLinesAroundBlockBody:
42
+ Exclude:
43
+ - 'spec/crm_formatter_spec.rb'
44
+
45
+ # Offense count: 2
46
+ # Cop supports --auto-correct.
47
+ # Configuration parameters: EnforcedStyle.
48
+ # SupportedStyles: empty_lines, empty_lines_except_namespace, empty_lines_special, no_empty_lines, beginning_only, ending_only
49
+ Layout/EmptyLinesAroundClassBody:
50
+ Exclude:
51
+ - 'lib/crm_formatter/wrap.rb'
52
+
53
+ # Offense count: 3
54
+ # Cop supports --auto-correct.
55
+ # Configuration parameters: EnforcedStyle.
56
+ # SupportedStyles: empty_lines, empty_lines_except_namespace, empty_lines_special, no_empty_lines
57
+ Layout/EmptyLinesAroundModuleBody:
58
+ Exclude:
59
+ - 'lib/crm_formatter.rb'
60
+ - 'lib/crm_formatter/wrap.rb'
61
+
62
+ # Offense count: 2
63
+ # Cop supports --auto-correct.
64
+ # Configuration parameters: .
65
+ # SupportedStyles: space, no_space
66
+ Layout/SpaceAroundEqualsInParameterDefault:
67
+ EnforcedStyle: no_space
68
+
69
+ # Offense count: 1
70
+ # Cop supports --auto-correct.
71
+ # Configuration parameters: AllowForAlignment.
72
+ Layout/SpaceAroundOperators:
73
+ Exclude:
74
+ - 'lib/crm_formatter/wrap.rb'
75
+
76
+ # Offense count: 1
77
+ # Cop supports --auto-correct.
78
+ # Configuration parameters: EnforcedStyle, EnforcedStyleForEmptyBraces, SpaceBeforeBlockParameters.
79
+ # SupportedStyles: space, no_space
80
+ # SupportedStylesForEmptyBraces: space, no_space
81
+ Layout/SpaceInsideBlockBraces:
82
+ Exclude:
83
+ - 'Gemfile'
84
+
85
+ # Offense count: 6
86
+ # Cop supports --auto-correct.
87
+ # Configuration parameters: EnforcedStyle, EnforcedStyleForEmptyBraces.
88
+ # SupportedStyles: space, no_space, compact
89
+ # SupportedStylesForEmptyBraces: space, no_space
90
+ Layout/SpaceInsideHashLiteralBraces:
91
+ Exclude:
92
+ - 'lib/crm_formatter/wrap.rb'
93
+
94
+ # Offense count: 2
95
+ Lint/Debugger:
96
+ Exclude:
97
+ - 'lib/crm_formatter.rb'
98
+
99
+ # Offense count: 1
100
+ # Configuration parameters: IgnoreImplicitReferences.
101
+ Lint/ShadowedArgument:
102
+ Exclude:
103
+ - 'lib/crm_formatter/wrap.rb'
104
+
105
+ # Offense count: 1
106
+ # Cop supports --auto-correct.
107
+ # Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods.
108
+ Lint/UnusedMethodArgument:
109
+ Exclude:
110
+ - 'lib/crm_formatter/wrap.rb'
111
+
112
+ # Offense count: 4
113
+ Lint/UselessAssignment:
114
+ Exclude:
115
+ - 'lib/crm_formatter/wrap.rb'
116
+
117
+ # Offense count: 1
118
+ # Configuration parameters: CountComments, ExcludedMethods.
119
+ Metrics/BlockLength:
120
+ Max: 30
121
+
122
+ # Offense count: 1
123
+ # Configuration parameters: CountComments.
124
+ Metrics/MethodLength:
125
+ Max: 12
126
+
127
+ # Offense count: 1
128
+ Naming/AccessorMethodName:
129
+ Exclude:
130
+ - 'lib/crm_formatter/wrap.rb'
131
+
132
+ # Offense count: 3
133
+ Style/Documentation:
134
+ Exclude:
135
+ - 'spec/**/*'
136
+ - 'test/**/*'
137
+ - 'lib/crm_formatter.rb'
138
+ - 'lib/crm_formatter/dictionary.rb'
139
+ - 'lib/crm_formatter/wrap.rb'
140
+
141
+ # Offense count: 1
142
+ # Configuration parameters: MinBodyLength.
143
+ Style/GuardClause:
144
+ Exclude:
145
+ - 'lib/crm_formatter/wrap.rb'
146
+
147
+ # Offense count: 2
148
+ # Cop supports --auto-correct.
149
+ # Configuration parameters: EnforcedStyle, UseHashRocketsWithSymbolValues, PreferHashRocketsForNonAlnumEndingSymbols.
150
+ # SupportedStyles: ruby19, hash_rockets, no_mixed_keys, ruby19_no_mixed_keys
151
+ Style/HashSyntax:
152
+ Exclude:
153
+ - 'Rakefile'
154
+
155
+ # Offense count: 1
156
+ # Cop supports --auto-correct.
157
+ Style/MutableConstant:
158
+ Exclude:
159
+ - 'lib/crm_formatter/version.rb'
160
+
161
+ # Offense count: 2
162
+ # Cop supports --auto-correct.
163
+ # Configuration parameters: PreferredDelimiters.
164
+ Style/PercentLiteralDelimiters:
165
+ Exclude:
166
+ - 'lib/crm_formatter/wrap.rb'
167
+
168
+ # Offense count: 67
169
+ # Cop supports --auto-correct.
170
+ # Configuration parameters: EnforcedStyle, ConsistentQuotesInMultiline.
171
+ # SupportedStyles: single_quotes, double_quotes
172
+ Style/StringLiterals:
173
+ Exclude:
174
+ - 'Gemfile'
175
+ - 'Rakefile'
176
+ - 'bin/console'
177
+ - 'lib/crm_formatter.rb'
178
+ - 'lib/crm_formatter/version.rb'
179
+ - 'lib/crm_formatter/wrap.rb'
180
+ - 'menu.rb'
181
+ - 'spec/crm_formatter_spec.rb'
182
+ - 'spec/spec_helper.rb'
183
+
184
+ # Offense count: 7
185
+ # Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns.
186
+ # URISchemes: http, https
187
+ Metrics/LineLength:
188
+ Max: 549
@@ -0,0 +1,74 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to making participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, gender identity and expression, level of experience,
9
+ nationality, personal appearance, race, religion, or sexual identity and
10
+ orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies both within project spaces and in public spaces
49
+ when an individual is representing the project or its community. Examples of
50
+ representing a project or community include using an official project e-mail
51
+ address, posting via an official social media account, or acting as an appointed
52
+ representative at an online or offline event. Representation of a project may be
53
+ further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at 4rlm@protonmail.ch. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at [http://contributor-covenant.org/version/1/4][version]
72
+
73
+ [homepage]: http://contributor-covenant.org
74
+ [version]: http://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: false
2
+
3
+ source "https://rubygems.org"
4
+
5
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
6
+
7
+ # Specify your gem's dependencies in crm_formatter.gemspec
8
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 Adam Booth
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md CHANGED
@@ -1,172 +1,238 @@
1
1
 
2
- # **CRM Formatter**
2
+ # CRM Wrap
3
+ #### Efficiently Reformat, Normalize, and Scrub CRM Contact Data, such as Addresses, Phones and URLs.
3
4
 
4
- #### Reformat and Normalize CRM Contact Data, such as Addresses, Phones and URLs.
5
+ CRM Wrap is perfect for curating high-volume enterprise-scale web scraping, and integrates well with Nokogiri, Mechanize, and asynchronous jobs via Delayed_job or Sidekiq, to name a few. Web Scraping and Harvesting often gathers a lot of junk to sift through, presenting unexpected edge cases around each corner. CRM Wrap has been developed and refined during the past few years to focus on improving that task.
5
6
 
6
- **CRM Formatter** was originally designed to curate high-volume enterprise-scale asynchronous web scraping via Nokogiri, Mechanize, and Delayed_job. Web Scraping *aka Web Harvesting / Data Mining* is notoriously unreliable *sticky* work with endless edge-cases to overcome. Accurately, yet efficiently curating such data is a constant and evolving task, and will continue to be the core functionality of **CRM Formatter**.
7
- However, it also plays an integral role in routine functions of apps, like formatting, normalizing, and even scrubbing existing databases, and submitted form data before saving to the database; via model callbacks, such as `before_validation` or `before_save`.
7
+ It's also perfect for processing API data, Web Forms, and routine DB normalizing and scrubbing processes. Not only does it reformat Address, Phone, and Web data, it can also accept lists to scrub against, then providing detailed reports about how each piece of data compares with your criteria lists.
8
8
 
9
- ###### The **CRM Formatter** Gem is currently in `--pre` versioning, aka **beta mode** with frequent updates. Formal tests in the gem environment are still on the way.
10
- However, **CRM Formatter** has been developed continuously for several years and is a reliable and integral part of a production CRM data verification app. The process of isolating the various modules into a consolidated open source gem has just recently begun, so documentation is still limited, but is frequently being added and refined.
9
+ The CRM Wrap Gem is currently in '--pre versioning', or 'beta mode', as the process of reorganizing these proprietary, production environment processes from their native app environment into this newly created open source gem is still underway. Formal tests in the gem environment are still on the way, as is the documentation. But the processes themselves have been very reliable and an integral part of a very large app dedicated to such services.
11
10
 
12
11
  ## Getting Started
13
- **CRM Formatter** is compatible with Rails 4.2 and 5.0, 5.1 and 5.2 on Ruby 2.2 and later.
12
+ CRM Wrap is compatible with Rails 4.2 and 5.0, 5.1 and 5.2 on Ruby 2.2 and later.
14
13
 
15
14
  In your Gemfile add:
16
-
17
15
  ```
18
- gem 'crm_formatter', '~> 1.0.6.pre.rc.1'
16
+ gem 'crm_formatter', '~> 1.0.8.pre.rc.1'
19
17
  ```
20
-
21
18
  Or to install locally:
22
-
23
19
  ```
24
20
  gem install crm_formatter --pre
25
21
  ```
26
-
27
22
  ## Usage
23
+ Using CRM Wrap in your app is very simple, and could be accessed from your app's concerns, controllers, helpers, lib, models, or services, but depends on the scope, location, and size of your application and server. For simple form submission validations the model callback is typically ideal. For database normalizing tasks the concerns, helpers, or lib is typically ideal. For long running processes like web scraping or high volume API calls, like Google, LinkedIn, or Twitter, the lib or services might be ideal (asynchronous multithreaded even better).
28
24
 
29
- ##### Usage is organized into three sections, Overview, Methods and Examples.
30
-
31
- ### I. Overview
32
-
33
- #### 1. Access and Integration
34
- ##### Using **CRM Formatter** in your app is very simple, and could be accessed from your app's concerns, controllers, helpers, lib, models, or services, but depends on the scope, location, and size of your application and server.
35
- * Simple form submission validations: model callback typically ideal.
36
- * Database normalizing tasks: wrapper method in concerns, helpers, or lib typically ideal.
37
- * Long running processes like web scraping or high volume APIs calls, like Google Linkedin, or Twitter: the lib or services might be ideal (multithreaded asynchronously even better)
38
-
39
- #### 2. Hash Response
40
- ##### Formatted data will always be returned as a hash datatype the following key-value pairs:
41
- * The originally submitted data as the first pair.
42
- * Formatted data in the remaining pairs.
43
- * A T/F boolean indicator pair regarding if the original and formatted data are different.
44
-
45
- #### 3. Optional Arguments *OA*
46
- ##### A class can be instantiated with optional arguments *OA*.
47
- * OA house the criteria by which you'd like to scrub your data.
48
- * Each is either 'Pos' or 'Neg', for more accurate reporting of your scrubbing results.
49
- * List of available Web OA is below, and each accepts data in the hash datatype, aka 'keyword-args'.
50
- * For example, you might want to know which URLs contain 'twitter', 'facebook', or 'linkedin' either to focus on developing a list of business social media links, or perhaps you want to use such a list to better avoid such links.
51
- * *OA is currently only available for the Web class.*
52
- * *Address OA & Phone OA will be available in a future release.*
53
-
54
- ### II. Methods
55
- ##### CRM Formatter**'s top level module is `CRMFormatter` and contains the following three classes:
56
- 1. Address: `CRMFormatter::Address.new`
57
- 2. Phone: `CRMFormatter::Address.new`
58
- 3. Web: `CRMFormatter::Address.new`
59
-
60
- ###### Then assign the above to a variable name of your choosing.
61
- `addr_formatter = CRMFormatter::Address.new`
62
- `@addr_formatter = CRMFormatter::Address.new`
63
-
64
- ###### Web accepts optional arguments *OA* as a Hash (with Key-Value pairs)
65
- Without OA: Instantiate normally if not using OA.
66
- `web_formatter = CRMFormatter::Web.new`
67
-
68
- With OA: Follow the steps to use Web OA:
69
- 1. Available Web OA and the required Key-Value naming and datatypes.
70
- * Only list the OA K-V Pairs you're using. No need to list empty values. It's not all or nothing. These are empty to illustrate the expected datatypes.
71
-
72
- Below is how the OA are received in the Web class at initialization.
73
- **3. Web Examples at the very bottom has a very detailed example including how OA can be used.**
74
- ```
75
- def initialize(args={})
76
- @empty_oa = args.empty?
77
- @pos_urls = args.fetch(:pos_urls, [])
78
- @neg_urls = args.fetch(:neg_urls, [])
79
- @pos_links = args.fetch(:pos_links, [])
80
- @neg_links = args.fetch(:neg_links, [])
81
- @pos_hrefs = args.fetch(:pos_hrefs, [])
82
- @neg_hrefs = args.fetch(:neg_hrefs, [])
83
- @pos_exts = args.fetch(:pos_exts, [])
84
- @neg_exts = args.fetch(:neg_exts, [])
85
- @min_length = args.fetch(:min_length, 2)
86
- @max_length = args.fetch(:max_length, 100)
87
- end
25
+ ### Class Names
26
+ CrmFormatter contains three classes, which can be accessed like below with local or instance variables; you can name them anything you like.
88
27
  ```
28
+ adr_formatter = CrmFormatter::Address.new
29
+ @adr_formatter = CrmFormatter::Address.new
89
30
 
90
- Example: Below is the syntax for how to use OA.
91
- There are both Positive and Negative. They work the same and could just be included in the same array if you prefer. But they are intended to help you scrub data against negative criteria and for positive criteria.
92
- ```
93
- oa_args = { neg_urls: %w(approv insur invest loan quick rent repair),
94
- neg_links: %w(buy call cash cheap click gas insta),
95
- neg_hrefs: %w(after anounc apply approved blog buy call click),
96
- neg_exts: %w(au ca edu es gov in ru uk us),
97
- min_length: 0,
98
- max_length: 30
99
- }
31
+ ph_formatter = CrmFormatter::Phone.new
32
+ @ph_formatter = CrmFormatter::Phone.new
100
33
 
101
- @web_formatter = CRMFormatter::Web.new(oa_args)
34
+ web_formatter = CrmFormatter::Web.new
35
+ @web_formatter = CrmFormatter::Web.new
102
36
  ```
103
37
 
104
- #### 1. Address Methods
105
-
106
- `get_full_address()` takes a hash of address parts then runs each through their respective formatters, then also adds an additional feature of combining them into a long full address string, and indicates if there were any changes from the original version and newly formatted.
38
+ ### Available Methods in Each Class
107
39
 
40
+ ## Address Methods
41
+ These are the methods available to you. You can use them à la carte, for example if you just wanted to format all your states, or you could combine the entire address into `get_full_address()` which will run each of the related methods for you. It also adds an additional hash pair containing the full address as a single string. There is also an indicator pair to report if there were any changes from the original version to the newly formatted.
108
42
  ```
109
- addr_formatter = CRMFormatter::Address.new
110
-
43
+ addr_formatter = CrmFormatter::Address.new
111
44
  full_address_hash = {street: street, city: city, state: state, zip: zip}
112
-
113
45
  addr_formatter.get_full_address(full_address_hash)
114
-
115
46
  addr_formatter.format_street(street_string)
116
-
117
47
  addr_formatter.format_city(city_string)
118
-
119
48
  addr_formatter.format_state(state_string)
120
-
121
49
  addr_formatter.format_zip(zip)
122
-
123
50
  addr_formatter.format_full_address(adr = {})
124
-
125
51
  addr_formatter.compare_versions(original, formatted)
126
-
127
52
  ```
128
53
 
129
54
  #### Phone Methods
55
+ Phone only has two methods, with a subtle but important distinction between them. For simply formatting a known phone, use `format_phone` to convert to the normalized (555) 123-4567 format. Use `validate_phone` if either your phone data has a bunch of text and special characters to remove, or if you aren't even sure that it is a phone, as it will help determine if the phone number seems legitimate. If so, it then passes it along to `format_phone`.
56
+ ```
57
+ ph_formatter = CrmFormatter::Phone.new
58
+ ph_formatter.validate_phone(phone)
59
+ ph_formatter.format_phone(phone)
60
+ ```
61
+
62
+ #### Web Methods
63
+ The examples on this README are from `format_url` method. The others are for web scraping, which will be documented in the near future.
64
+ ```
65
+ web_formatter = CrmFormatter::Web.new
66
+ web_formatter.format_url(url)
67
+ web_formatter.extract_path(url_path)
68
+ web_formatter.remove_invalid_links(link)
69
+ web_formatter.remove_invalid_hrefs(href)
70
+ web_formatter.convert_to_scheme_host(url)
71
+ ```
130
72
 
131
- Subtle but important distinction between 'format_phone' which simply puts a phone in any format, like 555-123-4567 into normalized (555) 123-4567, and 'validate_phone' which also uses 'format_phone' to normalize its output, but is mainly tasked with determining if the phone number seem legitimate. If you know for sure that it is a phone number, but just want to normalize then first try format_phone. If you are doing web scraping or throwing in strings of text mixed with phones, then validate_phone might work better.
73
+ ## Examples
74
+ #### Below are two examples using the Web `format_url(url)` method:
132
75
 
76
+ ### Example 1: 6 Example URLs Submitted:
77
+ Custom Method to Query URLs
78
+ ```
79
+ def self.get_urls
80
+ urls = %w(website.com website.business.site website website.fake website.fake.com website.com.fake)
81
+ end
82
+ ```
83
+ Custom Wrapper Method
84
+ ```
85
+ def self.run_webs
86
+ web = CrmFormatter::Web.new
87
+ formatted_url_hashes = get_urls.map do |url|
88
+ url_hash = web.format_url(url)
89
+ end
90
+ end
91
+ ```
92
+ Results as Hash: 3/6 not reformatted due to invalid, missing, or multiple URL extensions. 3/6 reformatted and normalized with `http://www.`
93
+ URL Extensions, **.com, .net, .fake** cross referenced with official IANA list.
94
+ ```
95
+ [ {:reformatted=>true, :url_path=>"website.com", :formatted_url=>"http://www.website.com", :neg=>[], :pos=>[]},
96
+ {:reformatted=>false, :url_path=>"website.business.site", :formatted_url=>nil, :neg=>["error: ext.valid > 1 [business, site]"], :pos=>[]}, {:reformatted=>false, :url_path=>"website", :formatted_url=>nil, :neg=>["error: ext.none"], :pos=>[]},
97
+ {:reformatted=>false, :url_path=>"website.fake", :formatted_url=>nil, :neg=>["error: ext.invalid [fake]"], :pos=>[]},
98
+ {:reformatted=>true, :url_path=>"website.fake.com", :formatted_url=>"http://www.website.com", :neg=>[], :pos=>[]},
99
+ {:reformatted=>true, :url_path=>"website.com.fake", :formatted_url=>"http://www.website.com", :neg=>[], :pos=>[]}
100
+ ]
133
101
  ```
134
- ph_formatter = CRMFormatter::Phone.new
135
102
 
136
- ph_formatter.validate_phone(phone)
103
+ ### Example 2: 6 Real URLs with Scrubbing Feature, but same configuration as above:
104
+ **Intentionally partially obfuscated**
105
+ ```
106
+ urls = %w(approvXXXutosales.org autXXXartfinance.com leXXXummitautorepair.net melXXXtoyota.com norXXXastacura.com XXXmazda.com)
107
+ ```
108
+ These results list 'neg' and 'pos', which are the criteria I was scrubbing against. I wanted to find the URLs of franchise auto dealers and exclude ancillary URLs.
109
+ ```
110
+ [{:reformatted=>true, :url_path=>"approvXXXutosales.org", :formatted_url=>"http://www.approvXXXutosales.org", :neg=>["neg_urls: approv"], :pos=>[]},
111
+ {:reformatted=>true, :url_path=>"autXXXartfinance.com", :formatted_url=>"http://www.autXXXartfinance.com", :neg=>["neg_urls: financ"], :pos=>["pos_urls: smart"]},
112
+ {:reformatted=>true, :url_path=>"leXXXummitautorepair.net", :formatted_url=>"http://www.leXXXummitautorepair.net", :neg=>["neg_urls: repair"], :pos=>[]},
113
+ {:reformatted=>true, :url_path=>"melXXXtoyota.com", :formatted_url=>"http://www.melXXXtoyota.com", :neg=>[], :pos=>["pos_urls: toyota"]},
114
+ {:reformatted=>true, :url_path=>"norXXXastacura.com", :formatted_url=>"http://www.norXXXastacura.com", :neg=>[], :pos=>["pos_urls: acura"]},
115
+ {:reformatted=>true, :url_path=>"XXXmazda.com", :formatted_url=>"http://www.XXXmazda.com", :neg=>[], :pos=>["pos_urls: mazda"]}
116
+ ]
117
+ ```
137
118
 
138
- ph_formatter.format_phone(phone)
119
+ ## Quick Setup Guide
139
120
 
121
+ #### Create a Wrapper with a custom Class and Method(s)
122
+ This is just one of several ways to configure. If you only need the gem for formatting form data, you could just create a callback method in your model, but to scrub a database or process API and Harvested data, you'll want a dedicated process so you can manage the queue, criteria, and results. If you don't already have one, this example will show you how. Concerns, Helpers and Models might be fine for smaller tasks, but for heavier tasks Lib and Services are ideal, but depends on your specifications.
123
+ ```
124
+ # /app/lib/start_crm.rb
140
125
  ```
126
+ ```
127
+ class StartCrm
128
+ def initialize
129
+ @web = CrmFormatter::Web.new
130
+ end
141
131
 
142
- #### Web Methods
132
+ def run_webs
133
+ formatted_url_hashes = urls.map do |url|
134
+ url_hash = @web.format_url(url)
135
+ end
136
+ end
137
+ end
138
+ ```
139
+ You may need to edit your application config file to recognize your new class.
140
+ ```
141
+ #/app/config/application.rb
143
142
 
143
+ config.eager_load_paths << Rails.root.join('lib/**')
144
+ config.eager_load_paths += Dir["#{config.root}/lib/**/"]
145
+ ```
146
+ #### Run in Rails Console
147
+ In this example, we'll run it in Rails Console like below, but you could also create a Rake Task and integrate it with a scheduled Cron Job. You could also run the process through your controller actions in a GUI. If accessing through the front end, you might want to do it asynchronously with gems like Delayed_job or Sidekiq so you can free-up your controllers and prevent your front end from freezing while waiting for the job to complete; if running very large tasks.
148
+ ```
149
+ 2.5.1 :001 > StartCrm.new.run_webs
144
150
  ```
145
- web_formatter = CRMFormatter::Web.new
151
+ #### Instance vs Class Methods in your Wrapper
152
+ In the above example, `run_webs` is an instance method, but a class method `self.run_webs` could work well too, like the example below. At least in the early stages, this is a little easier if you keep running it in Rails C, because not requiring initializing means less to type to call it. Next you could set up your class with various methods to assist your process, like so:
153
+ ```
154
+ class StartCrm
155
+ def self.run_webs
156
+ web = CrmFormatter::Web.new
146
157
 
147
- web_formatter.format_url(url)
158
+ formatted_url_hashes = query_accounts.map do |act|
159
+ url_hsh = web.format_url(act.url)
148
160
 
149
- web_formatter.extract_link(url_path)
161
+ if url_hsh[:reformatted]
150
162
 
151
- web_formatter.remove_invalid_links(link)
163
+ act_hsh = { url: url_hsh[:formatted_url],
164
+ url_sts: url_hsh[:formatted_url],
165
+ scrub_date: Time.now
166
+ }
167
+ else
168
+ act_hsh = { scrub_date: Time.now }
169
+ end
152
170
 
153
- web_formatter.remove_invalid_hrefs(href)
171
+ act.update(act_hsh)
172
+ end
173
+ end
154
174
 
155
- web_formatter.convert_to_scheme_host(url)
175
+ def self.query_accounts
176
+ accounts = Account.where(url_sts: 'Invalid').limit(50)
177
+ end
178
+ end
179
+ ```
156
180
 
181
+ #### Data Response in a Hash
182
+ CRM Wrap returns data as a hash, which includes your original unaltered data you submitted, the formatted data, a T/F boolean indicator regarding if the original and formatted data are different, and for some methods, negs and pos regarding your criteria to scrub against. In the above example, the returned data from each submitted url would resemble the one below.
183
+ ```
184
+ # format_url method returns data like below this example...
185
+ # url_hash = {:reformatted=>false,
186
+ :url_path=>"https://www.steXXXXXXmitsubishiserviceandpartscenter.com",
187
+ :formatted_url=>"https://www.steXXXXXXmitsubishiserviceandpartscenter.com",
188
+ :neg=>["neg_urls: parts, rv, service"],
189
+ :pos=>["pos_urls: mitsubishi"]
190
+ }
157
191
  ```
158
192
 
159
- ### III. Examples
193
+ #### Optional Arguments OA
194
+ A class can be instantiated with optional arguments 'OA', to load your criteria to scrub against. Only list the OA K-V Pairs you're using. No need to list empty values. It's not all or nothing. These are empty to illustrate the expected datatypes.
195
+ **OA is currently only available for the Web class, but will soon be available in the Address & Phone classes.**
196
+
197
+ Below is how the OA are received in the Web class at initialization.
198
+ ```
199
+ def initialize(args={})
200
+ @empty_oa = args.empty?
201
+ @pos_urls = args.fetch(:pos_urls, [])
202
+ @neg_urls = args.fetch(:neg_urls, [])
203
+ @pos_links = args.fetch(:pos_links, [])
204
+ @neg_links = args.fetch(:neg_links, [])
205
+ @pos_hrefs = args.fetch(:pos_hrefs, [])
206
+ @neg_hrefs = args.fetch(:neg_hrefs, [])
207
+ @pos_exts = args.fetch(:pos_exts, [])
208
+ @neg_exts = args.fetch(:neg_exts, [])
209
+ @min_length = args.fetch(:min_length, 2)
210
+ @max_length = args.fetch(:max_length, 100)
211
+ end
212
+ ```
213
+
214
+ Below is the syntax for how to use OA. Positive and Negative options are available and essentially function the same, but allow additional options for scrubbing data.
215
+ ```
216
+ oa_args = { neg_urls: %w(approv insur invest loan quick rent repair),
217
+ neg_links: %w(buy call cash cheap click gas insta),
218
+ neg_hrefs: %w(after anounc apply approved blog buy call click),
219
+ neg_exts: %w(au ca edu es gov in ru uk us),
220
+ min_length: 0,
221
+ max_length: 30
222
+ }
223
+ @web_formatter = CrmFormatter::Web.new(oa_args)
224
+ ```
225
+
226
+ ### III. Detailed Examples
160
227
  Some of the examples are excessively verbose to help illustrate the datatypes and processes. Here are a few guidelines and tips:
161
- **3. Web Examples at the very bottom is the most detailed and recent. It might be a good place to start.**
162
- *These are just examples below, not strict usage guides ...*
163
228
 
164
- #### 1. Address Examples
229
+ *These are just examples, not strict usage guides ...*
165
230
 
231
+ #### 1. Address Examples
166
232
  ```
167
233
  def self.run_adrs
168
234
 
169
- crm_address_formatter = CRMFormatter::Address.new
235
+ crm_address_formatter = CrmFormatter::Address.new
170
236
 
171
237
  contacts = Contact.where.not(full_address: nil)
172
238
 
@@ -184,11 +250,9 @@ end
184
250
  ```
185
251
 
186
252
  #### 2. Phone Examples
187
-
188
- In the phone example, format_all_phone_in_my_db could be a custom wrapper method, which when called by Rails C or from a front end GUI process, could grab all phones in db meeting certain criteria to be scrubbed. The results will always be in hash format, such as below.... phone_hash
189
-
253
+ In the phone example, format_all_phone_in_my_db could be a custom wrapper method which, when called from Rails C or from a front-end GUI process, could grab all phones in the db meeting certain criteria to be scrubbed. The results will always be in hash format, such as phone_hash below.
190
254
  ```
191
- @crm_phone = CRMFormatter::Phone.new
255
+ @crm_phone = CrmFormatter::Phone.new
192
256
 
193
257
  def self.format_all_phone_in_my_db
194
258
  phones_from_contacts = Contacts.where.not(phone: nil)
@@ -199,15 +263,11 @@ def self.format_all_phone_in_my_db
199
263
 
200
264
  end
201
265
 
202
- phone_hash = { phone: 555-123-4567, valid_phone: (555) 123-4567, phone_edit: true }
266
+ phone_hash = { phone: 555-123-4567, phone_f: (555) 123-4567, phone_status: true }
203
267
  ```
204
268
 
205
269
  #### 3. Web Examples
206
-
207
- The steps below will show you an option for how you could integrate larger processes in your app.
208
- 1. Create a wrapper method you can call from an action or Rails C. In this example, a new class was also created in Lib for that purpose, as there could be related methods to create.
209
- * These examples only include `CRMFormatter::Web.new.format_url(url)` method. There are several additional methods available to you. Documentation is on the way, but in the mean time, try out the below example, then play around with the others too.
210
-
270
+ The steps below will show you an option for how you could integrate larger processes in your app. Create a wrapper method you can call from an action or Rails C. In this example, a new class was also created in Lib for that purpose, as there could be related methods to create.
211
271
  ```
212
272
  # /app/lib/start_crm.rb
213
273
 
@@ -216,7 +276,7 @@ class StartCrm
216
276
  ##Rails C: StartCrm.run_webs
217
277
  def self.run_webs
218
278
  oa_args = get_args
219
- web = CRMFormatter::Web.new(oa_args)
279
+ web = CrmFormatter::Web.new(oa_args)
220
280
 
221
281
  formatted_url_hashes = get_urls.map do |url|
222
282
  url_hash = web.format_url(url)
@@ -227,15 +287,14 @@ class StartCrm
227
287
 
228
288
  end
229
289
  ```
230
- 2. Make sure to modify your application config file to recognize your new class.
231
-
290
+ Application Config
232
291
  ```
233
292
  #/app/config/application.rb
234
293
 
235
294
  config.eager_load_paths << Rails.root.join('lib/**')
236
295
  config.eager_load_paths += Dir["#{config.root}/lib/**/"]
237
296
  ```
238
- 3. Create your db query or put together a list of URLs to process, along with any OA to include. The below example is very verbose, but designed to be helpful. In reality, you might have various criteria saved in the db rather than writing it out.
297
+ Create your db query or put together a list of URLs to process, along with any OA to include. The below example is very verbose, but designed to be helpful. In reality, you might have various criteria saved in the db rather than writing it out.
239
298
  In this example, we have auto dealer URLs. In this process, we're focusing on franchise dealers.
240
299
  ```
241
300
  def self.get_args
@@ -251,48 +310,46 @@ def self.get_urls
251
310
  urls = ["https://www.stevXXXXXXmitsubishiserviceandpartscenter.com", "https://www.perXXXXXXchryslerjeepcenterville.com", "http://www.peXXXXXXchryslerjeepcenterville.com", "http://www.colXXXXXXchryslerdodgejeepram.com"]
252
311
  end
253
312
  ```
254
- 4. Run your class and wrapper method in Rails C. By creating the wrapper method, you have set up the entire process to run like a runner. In reality, you might have several different criteria accessible from a GUI or even running in Cron Jobs.
255
-
256
- `2.5.1 :001 > StartCrm.run_webs`
257
-
258
- 5. Results are always in a Hash, like below. The URLs are slightly obfuscated out of respect (it's not a bug). These are examples from a large DB that runs on a loop 24/7 and gets to each organization about once a week, so it's already pretty well up to date, so there aren't any big changes below, but there are still a few things to point out.
259
-
260
- * `:is_reformatted` indicates T/F if url_path and `:formatted_url` differ. If False, then it means they are the same, or the `:url_path` had significant errors which prevented it from being formatted, thus `:formatted_url` would be nil in such a case. The reality is that you might have some URLs that are so far off that, that they can't be reliably reformatted, so better to only let them pass if we are confident that they are reliable.
261
-
262
- * `:url_path` is the url originally submitted by the client. It can include directory links on the end too, '/careers/, '/about-us/', etc.
263
-
264
- * `:formatted_url` is the formatted version of `:url_path`. It will be stripped of additional paths, '/deals/', '/staff/', etc. Also, often times people ommit 'http://:' and 'www' in CRMs. This can sometimes cause errors for users or Mechanized Web Scrapers. So, those will always be included to ensure consistency. In our production app we follow up the formatting with url redirect following, which our configurations require the entire path, so it will always be included. The redirect following gem is already being worked on and will be released as an additional gem shortly.
265
-
266
- * `:neg` is an array of all the errors and negative, undesirable criteria to scrub against. If you include the criteria in OA `neg_urls:`, like above, it will automatically scrub and report. Regardless, any errors will also be included in there. So, if the url was not ultimately formatted, there will be details regarding why in `:neg`.
267
-
268
- * `:pos` is the opposite, which highlights positive criteria you might be looking for. It too is available in OA via `pos_urls:`, like above.
269
-
313
+ Run your class and wrapper method in Rails C. By creating the wrapper method, you have set up the entire process to run like a runner. In reality, you might have several different criteria accessible from a GUI or even running in Cron Jobs.
314
+ ```
315
+ 2.5.1 :001 > StartCrm.run_webs
316
+ ```
317
+ Results are always in a Hash, like below. The URLs are slightly obfuscated out of respect (it's not a bug). These are examples from a large DB that runs on a loop 24/7 and gets to each organization about once a week, so it is already well up to date and there aren't any big changes shown, but there are still a few things to point out after the code example.
270
318
  ```
271
- [ {:is_reformatted=>false,
319
+ [ {:reformatted=>false,
272
320
  :url_path=>"https://www.steXXXXXXmitsubishiserviceandpartscenter.com",
273
321
  :formatted_url=>"https://www.steXXXXXXmitsubishiserviceandpartscenter.com",
274
322
  :neg=>["neg_urls: parts, rv, service"],
275
323
  :pos=>["pos_urls: mitsubishi"]},
276
324
 
277
- {:is_reformatted=>false,
325
+ {:reformatted=>false,
278
326
  :url_path=>"https://www.perXXXXXXchryslerjeepcenterville.com",
279
327
  :formatted_url=>"https://www.perXXXXXXchryslerjeepcenterville.com",
280
328
  :neg=>["neg_urls: rv"],
281
329
  :pos=>["pos_urls: chrysler, jeep"]},
282
330
 
283
- {:is_reformatted=>false,
331
+ {:reformatted=>false,
284
332
  :url_path=>"http://www.pXXXXXXchryslerjeepcenterville.com",
285
333
  :formatted_url=>"http://www.XXXXXXechryslerjeepcenterville.com",
286
334
  :neg=>["neg_urls: rv"],
287
335
  :pos=>["pos_urls: chrysler, jeep"]},
288
336
 
289
- {:is_reformatted=>false,
337
+ {:reformatted=>false,
290
338
  :url_path=>"http://www.colXXXXXXchryslerdodgejeepram.com",
291
339
  :formatted_url=>"http://www.colXXXXXXchryslerdodgejeepram.com",
292
340
  :neg=>["neg_urls: rv"],
293
341
  :pos=>["pos_urls: chrysler, dodge, jeep, ram"]}
294
342
  ]
295
343
  ```
344
+ `:reformatted` indicates T/F if `:url_path` and `:formatted_url` differ. If False, then it means they are the same, or the `:url_path` had significant errors which prevented it from being formatted, thus `:formatted_url` would be nil in such a case. The reality is that you might have some URLs that are so far off that they can't be reliably reformatted, so it is better to only let them pass if we are confident that they are reliable.
345
+
346
+ `:url_path` is the url originally submitted by the client. It can include directory links on the end too, '/careers/', '/about-us/', etc.
347
+
348
+ `:formatted_url` is the formatted version of `:url_path`. It will be stripped of additional paths, '/deals/', '/staff/', etc. Also, people often omit 'http://' and 'www' in CRMs. This can sometimes cause errors for users or Mechanized Web Scrapers. So, those will always be included to ensure consistency. In our production app we follow up the formatting with url redirect following, for which our configuration requires the entire path, so it will always be included. The redirect following gem is already being worked on and will be released as an additional gem shortly.
349
+
350
+ `:neg` is an array of all the errors and negative, undesirable criteria to scrub against. If you include the criteria in OA `neg_urls:`, like above, it will automatically scrub and report. Regardless, any errors will also be included in there. So, if the url was not ultimately formatted, there will be details regarding why in `:neg`.
351
+
352
+ `:pos` is the opposite, which highlights positive criteria you might be looking for. It too is available in OA via `pos_urls:`, like above.
296
353
 
297
354
 
298
355
  ## Author