ramparts 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/stale.yml +24 -0
- data/.gitignore +1 -0
- data/.rspec +1 -0
- data/.rubocop.yml +54 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +18 -0
- data/CONTRIBUTING.md +46 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +52 -0
- data/LICENSE.md +21 -0
- data/README.md +226 -0
- data/ROADMAP.md +21 -0
- data/Rakefile +0 -0
- data/lib/ramparts.rb +3 -0
- data/lib/ramparts/base.rb +99 -0
- data/lib/ramparts/data/list_of_email_domains.rb +73 -0
- data/lib/ramparts/helpers.rb +46 -0
- data/lib/ramparts/parsers/email_parser.rb +88 -0
- data/lib/ramparts/parsers/phone_parser.rb +137 -0
- data/lib/ramparts/parsers/url_parser.rb +30 -0
- data/lib/ramparts/version.rb +5 -0
- data/ramparts.gemspec +24 -0
- data/spec/data/email_and_phone_data/falsy_email_and_phone_data.rb +6 -0
- data/spec/data/email_and_phone_data/truthy_email_and_phone_data.rb +33 -0
- data/spec/data/email_data/falsy_email_data.rb +6 -0
- data/spec/data/email_data/truthy_email_data.rb +87 -0
- data/spec/data/phone_data/falsy_phone_data.rb +6 -0
- data/spec/data/phone_data/truthy_phone_data.rb +109 -0
- data/spec/data/url_data/falsy_url_data.rb +6 -0
- data/spec/data/url_data/truthy_url_data.rb +12 -0
- data/spec/parsers/email_and_phone_parser_spec.rb +44 -0
- data/spec/parsers/email_parser_spec.rb +60 -0
- data/spec/parsers/phone_parser_spec.rb +56 -0
- data/spec/parsers/url_parser_spec.rb +15 -0
- data/spec/spec_constants.rb +3 -0
- data/spec/spec_helper.rb +87 -0
- metadata +147 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 9623918d77fd13b5006a4436fd147eb2cbe3c790
|
|
4
|
+
data.tar.gz: cc81eb43b0c2378822b9bce67e35f7f6951c609d
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 3532ef87767a9649195d6508284d1cc5603cbc3aaa7e8e7a3809c1db748bf36e0835a9591713418cd5c548f1348c128fca9f6965fb5fa2a4690155cc23a565ad
|
|
7
|
+
data.tar.gz: 13cf057ea279241cabce865dd559db6178820fc9b4b44039f38bc30ae6101d1d2d3721094c2eddd45a28ded47bb7572219baabbbbaab3464cd4333f3e54d07d3
|
data/.github/stale.yml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Configuration for probot-stale - https://github.com/probot/stale
|
|
2
|
+
|
|
3
|
+
# Number of days of inactivity before an Issue or Pull Request becomes stale
|
|
4
|
+
daysUntilStale: 60
|
|
5
|
+
# Number of days of inactivity before a stale Issue or Pull Request is closed
|
|
6
|
+
daysUntilClose: 7
|
|
7
|
+
# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
|
|
8
|
+
exemptLabels:
|
|
9
|
+
- pinned
|
|
10
|
+
- security
|
|
11
|
+
- "[Status] Maybe Later"
|
|
12
|
+
# Label to use when marking as stale
|
|
13
|
+
staleLabel: wontfix
|
|
14
|
+
# Comment to post when marking as stale. Set to `false` to disable
|
|
15
|
+
markComment: >
|
|
16
|
+
This issue has been automatically marked as stale because it has not had
|
|
17
|
+
recent activity. It will be closed if no further activity occurs. Thank you
|
|
18
|
+
for your contributions.
|
|
19
|
+
# Comment to post when removing the stale label. Set to `false` to disable
|
|
20
|
+
unmarkComment: false
|
|
21
|
+
# Comment to post when closing a stale Issue or Pull Request. Set to `false` to disable
|
|
22
|
+
closeComment: false
|
|
23
|
+
# Limit to only `issues` or `pulls`
|
|
24
|
+
# only: issues
|
data/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
coverage
|
data/.rspec
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
--require spec_helper
|
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
TargetRubyVersion: 2.3
|
|
3
|
+
Exclude:
|
|
4
|
+
- 'ramparts.gemspec'
|
|
5
|
+
- 'Gemfile'
|
|
6
|
+
|
|
7
|
+
ClassLength:
|
|
8
|
+
Enabled: false
|
|
9
|
+
|
|
10
|
+
CyclomaticComplexity:
|
|
11
|
+
Enabled: false
|
|
12
|
+
|
|
13
|
+
Documentation:
|
|
14
|
+
Enabled: false
|
|
15
|
+
|
|
16
|
+
Metrics/AbcSize:
|
|
17
|
+
Enabled: false
|
|
18
|
+
|
|
19
|
+
Lint/AssignmentInCondition:
|
|
20
|
+
Enabled: false
|
|
21
|
+
|
|
22
|
+
Style/ParenthesesAroundCondition:
|
|
23
|
+
Enabled: false
|
|
24
|
+
|
|
25
|
+
Style/DoubleNegation:
|
|
26
|
+
Enabled: false
|
|
27
|
+
|
|
28
|
+
Style/ConditionalAssignment:
|
|
29
|
+
Enabled: false
|
|
30
|
+
|
|
31
|
+
Style/StringLiterals:
|
|
32
|
+
Exclude:
|
|
33
|
+
- 'spec/data/**/*'
|
|
34
|
+
|
|
35
|
+
Layout/SpaceAfterComma:
|
|
36
|
+
Exclude:
|
|
37
|
+
- 'spec/data/**/*'
|
|
38
|
+
|
|
39
|
+
Style/WordArray:
|
|
40
|
+
Exclude:
|
|
41
|
+
- 'spec/data/**/*'
|
|
42
|
+
|
|
43
|
+
LineLength:
|
|
44
|
+
Max: 120
|
|
45
|
+
Exclude:
|
|
46
|
+
- 'spec/data/**/*'
|
|
47
|
+
|
|
48
|
+
BlockLength:
|
|
49
|
+
Exclude:
|
|
50
|
+
- 'spec/**/*'
|
|
51
|
+
|
|
52
|
+
MethodLength:
|
|
53
|
+
CountComments: false
|
|
54
|
+
Max: 100
|
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Change Log
|
|
2
|
+
|
|
3
|
+
## 0.2.1 (14/12/2017)
|
|
4
|
+
- Creating a `spec_helper.rb` and implement SimpleCov tool
|
|
5
|
+
- Refactored test architecture to use hashes instead of arrays
|
|
6
|
+
- Reached 100% code coverage
|
|
7
|
+
|
|
8
|
+
## 0.2.0 (12/12/2017)
|
|
9
|
+
- Implementation of multi functions that allows parsing of both email and phone in same method
|
|
10
|
+
|
|
11
|
+
## 0.1.0 (09/12/2017)
|
|
12
|
+
- Implementation of all basic functions for emails, urls, and phone numbers
|
|
13
|
+
- Initial draft release
|
|
14
|
+
|
|
15
|
+
## 0.0.4 (08/12/2017)
|
|
16
|
+
- Use Ruby that's already installed on TravisCI
|
|
17
|
+
- Implement [Stale](https://github.com/probot/stale) for issue
|
|
18
|
+
- Add error messages, especially for malformed input
|
data/CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Everyone is welcome to contribute to Ramparts. Contributing doesn’t just mean submitting pull requests—there are many different ways for you to get involved, including answering questions, reporting or triaging [issues](https://github.com/CareGuide/ramparts/issues), etc.
|
|
4
|
+
|
|
5
|
+
No matter how you want to get involved, we ask that you be kind, and treat others on the project with respect.
|
|
6
|
+
|
|
7
|
+
We love pull requests. We'd like to at least comment on, if not
|
|
8
|
+
accept, pull requests within a few days. We may suggest some changes or improvements or alternatives.
|
|
9
|
+
|
|
10
|
+
Some things that will increase the chance that your pull request is accepted:
|
|
11
|
+
|
|
12
|
+
* Make sure the tests pass (this includes linting)
|
|
13
|
+
* Update the documentation: code comments, example code, guides. Basically,
|
|
14
|
+
update everything affected by your contribution.
|
|
15
|
+
* Include any information that would be relevant to reproducing bugs, use cases for new features, etc.
|
|
16
|
+
* If the change does break compatibility, how can it be updated to become backwards compatible, while directing users to the new way of doing things?
|
|
17
|
+
* A suitable and well thought-out commit message
|
|
18
|
+
|
|
19
|
+
# Branch Naming
|
|
20
|
+
|
|
21
|
+
Please follow the following naming convention for branch names. Use `-` to separate words.
|
|
22
|
+
|
|
23
|
+
- Use `feature/...` for feature related changes
|
|
24
|
+
- Use `fix/...` for changes that fix a bug
|
|
25
|
+
- Use `refactor/...` for general refactors of the code (eg. cleaning the code, adding comments)
|
|
26
|
+
- Finally use `update/...` for general updates that aren't refactors or bug fixes but aren't major enough to fall under features
|
|
27
|
+
|
|
28
|
+
# Testing
|
|
29
|
+
|
|
30
|
+
Run linting tests on your branch simply by typing `rubocop` when within the top level directory
|
|
31
|
+
|
|
32
|
+
Run general unit tests simply by typing `rspec` when within the top level directory
|
|
33
|
+
|
|
34
|
+
When writing tests please have the [first three digits](https://en.wikipedia.org/wiki/555_(telephone_number)) (not the area code) of phone numbers as `555` (or the phonetic/l33t equivalent) to avoid collisions with actual phone numbers
|
|
35
|
+
|
|
36
|
+
Please also use `example` for the domain of test email addresses for a similar reason
|
|
37
|
+
|
|
38
|
+
# Stale issue and pull request policy
|
|
39
|
+
|
|
40
|
+
Issues and pull requests have a shelf life and sometimes they are no longer relevant. All issues and pull requests that have not had any activity for 60 days will be marked as `stale`. Simply leave a comment with information about why it may still be relevant to keep it open. If no activity occurs in the next 7 days, the issue will be automatically closed. Stale PRs will be closed manually.
|
|
41
|
+
|
|
42
|
+
The goal of this process is to keep the list of open issues and pull requests focused on work that is actionable and important for the maintainers and the community.
|
|
43
|
+
|
|
44
|
+
# Pull Request Reviews & releasing
|
|
45
|
+
|
|
46
|
+
Once a pull request has been given approval, leadership will release and deploy the gem (see the [Roadmap](https://github.com/CareGuide/ramparts/blob/master/ROADMAP.md) for automatic deploys).
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
GEM
|
|
2
|
+
remote: http://rubygems.org/
|
|
3
|
+
specs:
|
|
4
|
+
ast (2.3.0)
|
|
5
|
+
diff-lcs (1.3)
|
|
6
|
+
docile (1.1.5)
|
|
7
|
+
json (2.1.0)
|
|
8
|
+
parallel (1.12.0)
|
|
9
|
+
parser (2.4.0.2)
|
|
10
|
+
ast (~> 2.3)
|
|
11
|
+
powerpack (0.1.1)
|
|
12
|
+
rainbow (2.2.2)
|
|
13
|
+
rake
|
|
14
|
+
rake (12.3.0)
|
|
15
|
+
rspec (3.7.0)
|
|
16
|
+
rspec-core (~> 3.7.0)
|
|
17
|
+
rspec-expectations (~> 3.7.0)
|
|
18
|
+
rspec-mocks (~> 3.7.0)
|
|
19
|
+
rspec-core (3.7.0)
|
|
20
|
+
rspec-support (~> 3.7.0)
|
|
21
|
+
rspec-expectations (3.7.0)
|
|
22
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
23
|
+
rspec-support (~> 3.7.0)
|
|
24
|
+
rspec-mocks (3.7.0)
|
|
25
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
26
|
+
rspec-support (~> 3.7.0)
|
|
27
|
+
rspec-support (3.7.0)
|
|
28
|
+
rubocop (0.51.0)
|
|
29
|
+
parallel (~> 1.10)
|
|
30
|
+
parser (>= 2.3.3.1, < 3.0)
|
|
31
|
+
powerpack (~> 0.1)
|
|
32
|
+
rainbow (>= 2.2.2, < 3.0)
|
|
33
|
+
ruby-progressbar (~> 1.7)
|
|
34
|
+
unicode-display_width (~> 1.0, >= 1.0.1)
|
|
35
|
+
ruby-progressbar (1.9.0)
|
|
36
|
+
simplecov (0.15.1)
|
|
37
|
+
docile (~> 1.1.0)
|
|
38
|
+
json (>= 1.8, < 3)
|
|
39
|
+
simplecov-html (~> 0.10.0)
|
|
40
|
+
simplecov-html (0.10.2)
|
|
41
|
+
unicode-display_width (1.3.0)
|
|
42
|
+
|
|
43
|
+
PLATFORMS
|
|
44
|
+
ruby
|
|
45
|
+
|
|
46
|
+
DEPENDENCIES
|
|
47
|
+
rspec
|
|
48
|
+
rubocop
|
|
49
|
+
simplecov
|
|
50
|
+
|
|
51
|
+
BUNDLED WITH
|
|
52
|
+
1.16.0
|
data/LICENSE.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2017 CareGuide
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# Ramparts - Spam Detection
|
|
2
|
+
Parses blocks of text to find phone numbers (including phonetic numbers), emails, and spammer urls
|
|
3
|
+
|
|
4
|
+
## Example
|
|
5
|
+
|
|
6
|
+
Find obfuscated phone numbers
|
|
7
|
+
|
|
8
|
+
```
|
|
9
|
+
>> message = "Contact me directly ( FOUR ONE FIVE E I G H T 9 FOUR TWO EIGHT SIX FIVE ). Hope you cracked that number code."
|
|
10
|
+
>> Ramparts.find_phone_numbers(message)
|
|
11
|
+
[{start_offset: 22, end_offset: 71, type: :phone, value: 'FOUR ONE FIVE E I G H T 9 FOUR TWO EIGHT SIX FIVE'}]
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Find obfuscated emails.
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
>> message = "Looking for honest worker .. contact ashley73299 AT yahoo dot com for more info"
|
|
18
|
+
>> Ramparts.find_emails(message)
|
|
19
|
+
[{start_offset: 37, end_offset: 65, type: :email, value: 'ashley73299 AT yahoo dot com'}]
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Find both obfuscated emails and phone numbers.
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
>> message = "Looking for honest worker .. contact ashley73299 AT yahoo dot com or FOUR FIVE ONE 456 8900 for more info"
|
|
26
|
+
>> Ramparts.find_phone_numbers_and_emails(message)
|
|
27
|
+
[{start_offset: 37, end_offset: 65, type: :email, value: 'ashley73299 AT yahoo dot com'}, {start_offset: 70, end_offset: 92, type: :phone, value: 'FOUR FIVE ONE 456 8900'}]
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Count the occurrences of well known spam URLs and keywords
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
>> message = "cialis vs viagra spam guestbook.php?action=http://cialiswalmart.shop"
|
|
34
|
+
>> Ramparts.count_urls(message)
|
|
35
|
+
3
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## API
|
|
39
|
+
|
|
40
|
+
#### count_phone_numbers(text, options = {})
|
|
41
|
+
- **Description:** Returns the count of the number of phone numbers in the text. Currently uses a map reduce paradigm,
|
|
42
|
+
which incurs information loss but is cleaner to implement, achieves better results, and is
|
|
43
|
+
**~2x faster** than `find_phone_numbers`
|
|
44
|
+
- **Input:**
|
|
45
|
+
- text **[String]**
|
|
46
|
+
- options **[Hash]**
|
|
47
|
+
- parse_leet **[Boolean][Default → True]**
|
|
48
|
+
- Parses phone numbers that contain l33t syntax. With this set to true eg. `FivE 4 3 F0r On3 67 NiN3` would be caught.
|
|
49
|
+
- remove_spaces **[Boolean][Default → True]**
|
|
50
|
+
- Parses phone numbers that contain spaces between the numbers. With this set to true eg. `F i v E 4 3 F 0 r O n 3 67 N i N 3` would be caught.
|
|
51
|
+
- **Output:**
|
|
52
|
+
- number of occurrences of phone numbers **[Integer]**
|
|
53
|
+
- **Example**
|
|
54
|
+
- **Input:**
|
|
55
|
+
- text → `"If you're interested in this position, do contact me directly on my phone number ( FOUR ONE FIVE E I G H T 9 FOUR TWO EIGHT SIX FIVE ). Hope you cracked that number code."`
|
|
56
|
+
- **Output:** `1`
|
|
57
|
+
|
|
58
|
+
#### find_phone_numbers(text, options = {})
|
|
59
|
+
- **Description:** Finds all occurrences of phone numbers within a block of text. Even when l33t speak, phonetics and
|
|
60
|
+
space variations are used.
|
|
61
|
+
- **Input:**
|
|
62
|
+
- text **[String]**
|
|
63
|
+
- options **[Hash]**
|
|
64
|
+
- To Be Implemented
|
|
65
|
+
- **Output:**
|
|
66
|
+
- **[Array]**
|
|
67
|
+
- match **[Hash]**
|
|
68
|
+
- offset: **[Integer]**
|
|
69
|
+
- value: **[String]**
|
|
70
|
+
- **Example**
|
|
71
|
+
- **Input:**
|
|
72
|
+
- text → `"If you're interested in this position, do contact me directly on my phone number ( FOUR ONE FIVE E I G H T 9 FOUR TWO EIGHT SIX FIVE ). Hope you cracked that number code."`
|
|
73
|
+
- **Output:** `[{start_offset: 84, end_offset: 133, type: :phone, value: 'FOUR ONE FIVE E I G H T 9 FOUR TWO EIGHT SIX FIVE'}]`
|
|
74
|
+
|
|
75
|
+
#### replace_phone_numbers(text, options = {}, &block)
|
|
76
|
+
- **Description:** Replaces all the occurrences of phone numbers within the text with what is returned in the block. Returns the redacted body
|
|
77
|
+
of text.
|
|
78
|
+
- **Input:**
|
|
79
|
+
- text **[String]**
|
|
80
|
+
- insertable **[String]**
|
|
81
|
+
- options **[Hash]**
|
|
82
|
+
- To Be Implemented
|
|
83
|
+
- **Output:**
|
|
84
|
+
- updated text **[String]**
|
|
85
|
+
- **Example**
|
|
86
|
+
- **Usage:** `altered_text = replace_phone_numbers(...) do CENSORED end`
|
|
87
|
+
- **Input:**
|
|
88
|
+
- text → `"If you're interested in this position, do contact me directly on my phone number ( FOUR ONE FIVE E I G H T 9 FOUR TWO EIGHT SIX FIVE ). Hope you cracked that number code."`
|
|
89
|
+
- **Output:** `"If you're interested in this position, do contact me directly on my phone number ( CENSORED ). Hope you cracked that number code."`
|
|
90
|
+
|
|
91
|
+
#### count_emails(text, options = {})
|
|
92
|
+
- **Description:** Returns the count of the number of emails in the text. Currently uses a map reduce paradigm,
|
|
93
|
+
which incurs information loss but is cleaner to implement, achieves better results, and is **~2x faster**
|
|
94
|
+
than `find_emails`
|
|
95
|
+
- **Input:**
|
|
96
|
+
- text **[String]**
|
|
97
|
+
- options **[Hash]**
|
|
98
|
+
- aggressive **[Boolean] [Default → `False`]**
|
|
99
|
+
- doesn't require a `.` or `dot` + a TLD at the end, but instead compares the last word against a well known list of email domains (eg. `contact ashley @ yandex for more info` would be caught)
|
|
100
|
+
- **Output:**
|
|
101
|
+
- number of occurrences of emails **[Integer]**
|
|
102
|
+
- **Example**
|
|
103
|
+
- **Input:**
|
|
104
|
+
- text → `"Hi, Are you seriously interested ..Looking for honest worker .. My e-mail is ashley73299 AT yahoo dot com, I repeat ashley73299 @ yahoo . com ?.. Ashley"`
|
|
105
|
+
- **Output:** `2`
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
#### find_emails(text, options = {})
|
|
109
|
+
- **Description:** Finds all occurrences of emails within a block of text, even when l33t speak or phonetics are used.
|
|
110
|
+
- **Input:**
|
|
111
|
+
- text **[String]**
|
|
112
|
+
- options **[Hash]**
|
|
113
|
+
- aggressive **[Boolean] [Default → `False`]**
|
|
114
|
+
- doesn't require a `.` or `dot` + a TLD at the end, but instead compares the last word against a well known list of email domains (eg. `contact ashley @ yandex for more info` would be caught)
|
|
115
|
+
- check_for_at **[Boolean] [Default → `False`]**
|
|
116
|
+
- checks for the word 'at' as '@', currently can result in algorithm being overly greedy as 'at' is such a common word
|
|
117
|
+
- **Output:**
|
|
118
|
+
- **[Array]**
|
|
119
|
+
- match **[Hash]**
|
|
120
|
+
- offset: **[Integer]**
|
|
121
|
+
- value: **[String]**
|
|
122
|
+
- **Example**
|
|
123
|
+
- **Input:**
|
|
124
|
+
- text → `"Hi, Are you seriously interested ..Looking for honest worker .. My e-mail is ashley73299 AT yahoo dot com, I repeat ashley73299 @ yahoo . com ?.. Ashley"`
|
|
125
|
+
- **Output:** `[{start_offset: 78, end_offset: 106, type: :email, value: 'ashley73299 AT yahoo dot com'}, {start_offset: 118, end_offset: 143, type: :email, value: 'ashley73299 @ yahoo . com'}]`
|
|
126
|
+
|
|
127
|
+
#### replace_emails(text, options = {}, &block)
|
|
128
|
+
- **Description:** Replaces all the occurrences of emails within the text with what is returned in the block. Returns the redacted body
|
|
129
|
+
of text.
|
|
130
|
+
- **Input:**
|
|
131
|
+
- text **[String]**
|
|
132
|
+
- options **[Hash]**
|
|
133
|
+
- aggressive **[Boolean] [Default → `False`]**
|
|
134
|
+
- doesn't require a `.` or `dot` + a TLD at the end, but instead compares the last word against a well known list of email domains (eg. `contact ashley @ yandex for more info` would be caught)
|
|
135
|
+
- check_for_at **[Boolean] [Default → `False`]**
|
|
136
|
+
- checks for the word 'at' as '@', currently can result in algorithm being overly greedy as 'at' is such a common word
|
|
137
|
+
- **Output:**
|
|
138
|
+
- updated text **[String]**
|
|
139
|
+
- **Example**
|
|
140
|
+
- **Usage:** `altered_text = replace_emails(...) do CENSORED end`
|
|
141
|
+
- **Input:**
|
|
142
|
+
- text → `"My name is Cynthia, a friend of mine needs a nanny to watch her baby in your area, her contact is ( jbush042@gmail.com ) She will be waiting to hear from you kindly send her an email now!"`
|
|
143
|
+
- **Output:** `My name is Cynthia, a friend of mine needs a nanny to watch her baby in your area, her contact is ( CENSORED ) She will be waiting to hear from you kindly send her an email now!`
|
|
144
|
+
|
|
145
|
+
#### count_phone_numbers_and_emails(text, options = {})
|
|
146
|
+
- **Description:** Returns the count of the number of phone numbers and emails in the text. Currently uses a map reduce paradigm,
|
|
147
|
+
which incurs information loss but is cleaner to implement, achieves better results, and is **~2x faster**
|
|
148
|
+
than `find_phone_numbers_and_emails`
|
|
149
|
+
- **Input:**
|
|
150
|
+
- text **[String]**
|
|
151
|
+
- options **[Hash]**
|
|
152
|
+
- parse_leet **[Boolean][Default → True]**
|
|
153
|
+
- Parses phone numbers that contain l33t syntax. With this set to true eg. `FivE 4 3 F0r On3 67 NiN3` would be caught.
|
|
154
|
+
- remove_spaces **[Boolean][Default → True]**
|
|
155
|
+
- Parses phone numbers that contain spaces between the numbers. With this set to true eg. `F i v E 4 3 F 0 r O n 3 67 N i N 3` would be caught.
|
|
156
|
+
- aggressive **[Boolean] [Default → `False`]**
|
|
157
|
+
- doesn't require a `.` or `dot` + a TLD at the end, but instead compares the last word against a well known list of email domains (eg. `contact ashley @ yandex for more info` would be caught)
|
|
158
|
+
- check_for_at **[Boolean] [Default → `False`]**
|
|
159
|
+
- checks for the word 'at' as '@', currently can result in algorithm being overly greedy as 'at' is such a common word
|
|
160
|
+
- **Output:**
|
|
161
|
+
- number of occurrences of phone numbers and emails **[Integer]**
|
|
162
|
+
- **Example**
|
|
163
|
+
- **Input:**
|
|
164
|
+
- text → `"Hi, Are you seriously interested ..Looking for honest worker .. My e-mail is ashley73299 AT yahoo dot com, phone 416 090 78 NINE 5 ?.. Ashley"`
|
|
165
|
+
- **Output:** `2`
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
#### find_phone_numbers_and_emails(text, options = {})
|
|
169
|
+
- **Description:** Finds all occurrences of phone numbers and emails within a block of text.
|
|
170
|
+
- **Input:**
|
|
171
|
+
- text **[String]**
|
|
172
|
+
- options **[Hash]**
|
|
173
|
+
- parse_leet **[Boolean][Default → True]**
|
|
174
|
+
- Parses phone numbers that contain l33t syntax. With this set to true eg. `FivE 4 3 F0r On3 67 NiN3` would be caught.
|
|
175
|
+
- remove_spaces **[Boolean][Default → True]**
|
|
176
|
+
- Parses phone numbers that contain spaces between the numbers. With this set to true eg. `F i v E 4 3 F 0 r O n 3 67 N i N 3` would be caught.
|
|
177
|
+
- aggressive **[Boolean] [Default → `False`]**
|
|
178
|
+
- doesn't require a `.` or `dot` + a TLD at the end, but instead compares the last word against a well known list of email domains (eg. `contact ashley @ yandex for more info` would be caught)
|
|
179
|
+
- check_for_at **[Boolean] [Default → `False`]**
|
|
180
|
+
- checks for the word 'at' as '@', currently can result in algorithm being overly greedy as 'at' is such a common word
|
|
181
|
+
- **Output:**
|
|
182
|
+
- **[Array]**
|
|
183
|
+
- match **[Hash]**
|
|
184
|
+
- offset: **[Integer]**
|
|
185
|
+
- value: **[String]**
|
|
186
|
+
- **Example**
|
|
187
|
+
- **Input:**
|
|
188
|
+
- text → `"Hi, Are you seriously interested ..Looking for honest worker .. My e-mail is ashley73299 AT yahoo dot com, phone 416 090 78 NINE 5 ?.. Ashley"`
|
|
189
|
+
- **Output:** `[{start_offset: 78, end_offset: 106, type: :email, value: 'ashley73299 AT yahoo dot com'}, {start_offset: 115, end_offset: 132, type: :phone, value: '416 090 78 NINE 5'}]`
|
|
190
|
+
|
|
191
|
+
#### replace_phone_numbers_and_emails(text, options = {}, &block)
|
|
192
|
+
- **Description:** Replaces all the occurrences of phone numbers and emails within the text with what is returned from the block. Returns the redacted body
|
|
193
|
+
of text.
|
|
194
|
+
- **Input:**
|
|
195
|
+
- text **[String]**
|
|
196
|
+
- options **[Hash]**
|
|
197
|
+
- parse_leet **[Boolean][Default → True]**
|
|
198
|
+
- Parses phone numbers that contain l33t syntax. With this set to true eg. `FivE 4 3 F0r On3 67 NiN3` would be caught.
|
|
199
|
+
- remove_spaces **[Boolean][Default → True]**
|
|
200
|
+
- Parses phone numbers that contain spaces between the numbers. With this set to true eg. `F i v E 4 3 F 0 r O n 3 67 N i N 3` would be caught.
|
|
201
|
+
- aggressive **[Boolean] [Default → `False`]**
|
|
202
|
+
- doesn't require a `.` or `dot` + a TLD at the end, but instead compares the last word against a well known list of email domains (eg. `contact ashley @ yandex for more info` would be caught)
|
|
203
|
+
- check_for_at **[Boolean] [Default → `False`]**
|
|
204
|
+
- checks for the word 'at' as '@', currently can result in algorithm being overly greedy as 'at' is such a common word
|
|
205
|
+
- **Output:**
|
|
206
|
+
- updated text **[String]**
|
|
207
|
+
- **Example**
|
|
208
|
+
- **Usage:** `altered_text = replace_phone_numbers_and_emails(...) do CENSORED end`
|
|
209
|
+
- **Input:**
|
|
210
|
+
- text → `"My name is Cynthia, a friend of mine needs a nanny to watch her baby in your area, her contact is ( jbush042@gmail.com or FOUR FIVE ONE 789 4568 ) She will be waiting to hear from you kindly send her an email now!"`
|
|
211
|
+
- **Output:** `My name is Cynthia, a friend of mine needs a nanny to watch her baby in your area, her contact is ( CENSORED or CENSORED ) She will be waiting to hear from you kindly send her an email now!`
|
|
212
|
+
|
|
213
|
+
#### count_urls(text, options = {})
|
|
214
|
+
- **Description:** Simple union regex to find if the text contains bad urls eg. viagra/cialis. Returns a count of the number of occurrences that
|
|
215
|
+
appear in the text.
|
|
216
|
+
- **Input:**
|
|
217
|
+
- text **[String]**
|
|
218
|
+
- options **[Hash]**
|
|
219
|
+
- To Be Implemented
|
|
220
|
+
- **Output:**
|
|
221
|
+
- number of occurrences of matches **[Integer]**
|
|
222
|
+
- **Example**
|
|
223
|
+
- **Input:**
|
|
224
|
+
- text → `"cialis vs cialis spam guestbook.php?action=http://cialiswalmart.shop"`
|
|
225
|
+
- **Output:** `3`
|
|
226
|
+
|