socials_regex 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 47c4cf61a023d67d7615180d178f906ac2f1158c78cab9e3be54beb966405443
4
+ data.tar.gz: 82b1baeda51da998982ca6c9634383273c6346b8c95c0e1281499ab127614291
5
+ SHA512:
6
+ metadata.gz: 87ffe002a52f1710bd071076edc4a7c360f1f83d8a3d627cf33fb916ab1a6f785d84135eaa11c4a57a8ab66fe2f05dbc5af8f8f7f7be00cc70ce107737ec384b
7
+ data.tar.gz: f857e78b0cfa5107eb9e57ed8ae760c879cc3ac175bf9a4675be1901e28f65994329c6af8863f0851ceea37823281a59d6599685896a443ab49619a536d1c57f
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,7 @@
1
+ require:
2
+ - rubocop-performance
3
+ - rubocop-rspec
4
+
5
+ AllCops:
6
+ TargetRubyVersion: 2.6
7
+ NewCops: enable
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [released]
2
+
3
+ ## [1.0.0] - 2023-07-02
4
+
5
+ - first release
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in socials_regex.gemspec
6
+ gemspec
7
+
8
+ gem 'rake', '~> 13.0'
9
+
10
+ group :development, :test do
11
+ gem 'rspec', '~> 3.0'
12
+ gem 'rubocop'
13
+ gem 'rubocop-performance'
14
+ gem 'rubocop-rake'
15
+ gem 'rubocop-rspec'
16
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,73 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ socials_regex (1.0.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ ast (2.4.2)
10
+ diff-lcs (1.5.0)
11
+ json (2.6.3)
12
+ parallel (1.23.0)
13
+ parser (3.2.2.1)
14
+ ast (~> 2.4.1)
15
+ rainbow (3.1.1)
16
+ rake (13.0.6)
17
+ regexp_parser (2.8.0)
18
+ rexml (3.2.5)
19
+ rspec (3.12.0)
20
+ rspec-core (~> 3.12.0)
21
+ rspec-expectations (~> 3.12.0)
22
+ rspec-mocks (~> 3.12.0)
23
+ rspec-core (3.12.2)
24
+ rspec-support (~> 3.12.0)
25
+ rspec-expectations (3.12.3)
26
+ diff-lcs (>= 1.2.0, < 2.0)
27
+ rspec-support (~> 3.12.0)
28
+ rspec-mocks (3.12.5)
29
+ diff-lcs (>= 1.2.0, < 2.0)
30
+ rspec-support (~> 3.12.0)
31
+ rspec-support (3.12.0)
32
+ rubocop (1.52.0)
33
+ json (~> 2.3)
34
+ parallel (~> 1.10)
35
+ parser (>= 3.2.0.0)
36
+ rainbow (>= 2.2.2, < 4.0)
37
+ regexp_parser (>= 1.8, < 3.0)
38
+ rexml (>= 3.2.5, < 4.0)
39
+ rubocop-ast (>= 1.28.0, < 2.0)
40
+ ruby-progressbar (~> 1.7)
41
+ unicode-display_width (>= 2.4.0, < 3.0)
42
+ rubocop-ast (1.29.0)
43
+ parser (>= 3.2.1.0)
44
+ rubocop-capybara (2.18.0)
45
+ rubocop (~> 1.41)
46
+ rubocop-factory_bot (2.23.1)
47
+ rubocop (~> 1.33)
48
+ rubocop-performance (1.18.0)
49
+ rubocop (>= 1.7.0, < 2.0)
50
+ rubocop-ast (>= 0.4.0)
51
+ rubocop-rake (0.6.0)
52
+ rubocop (~> 1.0)
53
+ rubocop-rspec (2.22.0)
54
+ rubocop (~> 1.33)
55
+ rubocop-capybara (~> 2.17)
56
+ rubocop-factory_bot (~> 2.22)
57
+ ruby-progressbar (1.13.0)
58
+ unicode-display_width (2.4.2)
59
+
60
+ PLATFORMS
61
+ x86_64-linux
62
+
63
+ DEPENDENCIES
64
+ rake (~> 13.0)
65
+ rspec (~> 3.0)
66
+ rubocop
67
+ rubocop-performance
68
+ rubocop-rake
69
+ rubocop-rspec
70
+ socials_regex!
71
+
72
+ BUNDLED WITH
73
+ 2.4.13
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 talaatmagdyx
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,115 @@
1
+ # SocialsRegex
2
+
3
+ ----
4
+ Social Regex Account Detection and Extraction for Ruby. Detect and extract URLs of social accounts: throw in URLs, get back URLs of social media profiles by type.
5
+
6
+ Features:
7
+
8
+ - detect the platform a url points to (all major platforms supported)
9
+ - extract the information contained within the url (without opening the url, of course)
10
+ - extract emails and phone numbers from hyperlinks
11
+
12
+ ## Installation
13
+
14
+ Install the gem and add to the application's Gemfile by executing:
15
+
16
+ $ bundle add socials_regex
17
+
18
+ If bundler is not being used to manage dependencies, install the gem by executing:
19
+
20
+ $ gem install socials_regex
21
+
22
+ ### Requirements
23
+ This gem requires Ruby 2.6+
24
+
25
+ ## Usage
26
+
27
+ ```ruby
28
+ require 'socials_regex'
29
+
30
+ supported_platforms = SocialsRegex::Platforms.all
31
+ # [:PLATFORM_FACEBOOK, :PLATFORM_GITHUB, :PLATFORM_LINKEDIN, :PLATFORM_TWITTER, :PLATFORM_INSTAGRAM, :PLATFORM_YOUTUBE,
32
+ # :PLATFORM_EMAIL, :PLATFORM_HACKER_NEWS, :PLATFORM_MEDIUM, :PLATFORM_PHONE, :PLATFORM_REDDIT,
33
+ # :PLATFORM_SKYPE, :PLATFORM_SNAPCHAT, :PLATFORM_STACKEXCHANGE, :PLATFORM_STACKOVERFLOW,
34
+ # :PLATFORM_TELEGRAM, :PLATFORM_VIMEO, :PLATFORM_XING, :PLATFORM_ANGELLIST, :PLATFORM_CRUNCHBASE,
35
+ # :PLATFORM_STACKEXCHANGE_NETWORK, :PLATFORM_WHATSAPP, :PLATFORM_YELP]
36
+
37
+
38
+ supported_regexes = SocialsRegex::Regexes.all
39
+ # [:ANGELLIST_URL_REGEX, :CRUNCHBASE_URL_REGEX, :EMAIL_URL_REGEX, :FACEBOOK_URL_REGEX, :GITHUB_URL_REGEX, :HACKERNEWS_URL_REGEX,
40
+ # :INSTAGRAM_URL_REGEX, :LINKEDIN_URL_REGEX, :MEDIUM_URL_REGEX, :PHONE_URL_REGEX, :REDDIT_URL_REGEX, :SKYPE_URL_REGEX, :SNAPCHAT_URL_REGEX,
41
+ # :STACKEXCHANGE_URL_REGEX, :STACKEXCHANGE_NETWORK_URL_REGEX, :STACKOVERFLOW_URL_REGEX, :TELEGRAM_URL_REGEX, :TWITTER_URL_REGEX,
42
+ # :VIMEO_URL_REGEX, :XING_URL_REGEX, :YOUTUBE_URL_REGEX, :WHATSAPP_URL_REGEX, :YELP_URL_REGEX]
43
+
44
+ # get the regexes for all platforms
45
+ platform_regexes = SocialsRegex::Socials::PLATFORMS_REGEX
46
+ # example [:yelp, {:company=>/(?:https?:\/\/)?(?:www\.)?yelp\.com\/biz\/(?<company>[A-Za-z0-9_-]+)/}]
47
+
48
+ # get regex for specific platforms
49
+ twitter_regex = SocialsRegex::Socials::PLATFORMS_REGEX[:twitter]
50
+ # {:status=>/(?:https?:)?\/\/(?:[A-Za-z]+\.)?twitter\.com\/@?(?<username>[A-Za-z0-9_]+)\/status\/(?<tweet_id>[0-9]+)\/?/,
51
+ # :user=>/(?:https?:)?\/\/(?:[A-Za-z]+\.)?twitter\.com\/@?(?!home|share|privacy|tos)(?<username>[A-Za-z0-9_]+)\/?/}
52
+
53
+
54
+ # how to extract social data from links or texts
55
+ text = 'https://twitter.com/karllorey/status/1259924082067374088' \
56
+ 'https://twitter.com/karllorey12/status/12599240820673740883' \
57
+ 'http://crunchbase.com/organization/acme-corp jeff@amazon.com mailto:plususer+test@gmail.com' \
58
+ 'https://facebook.com/peter.parker https://www.facebook.com/profile.php?id=100004123456789' \
59
+ 'https://github.com/talaatmagdyx https://github.com/talaatmagdyx/socials_regex' \
60
+ 'https://news.ycombinator.com/item?id=23290375 https://instagram.com/__disco__dude' \
61
+ 'https://www.linkedin.com/in/talaatmagdyx/ https://medium.com/does-exist/some-post-123abc'
62
+ extract = SocialsRegex::Extraction.new(text: text)
63
+ # #<SocialsRegex::Extraction:0x00007f5c51d0c488 @text= "https://twitter.com/karllorey/status/......">
64
+
65
+ # to extract all links and data
66
+ extract.extract_matches_per_platform
67
+ # {:crunchbase=>{:company=>[{:matched=>"http://crunchbase.com/organization/acme-corp", "organization"=>"acme-corp"}]},
68
+ # :medium=>{:post=>[{:matched=>"https://medium.com/does-exist/some-post-123abc", "username"=>nil, "publication"=>"does-exist", "slug"=>"some-post", "post_id"=>"123abc"}]},
69
+ # :hackernews=>{:item=>[{:matched=>"https://news.ycombinator.com/item?id=23290375", "item"=>"23290375"}]},
70
+ # :email=>{:email=>[{:matched=>"jeff@amazon.com", "email"=>"jeff@amazon.com"}, {:matched=>"mailto:plususer+test@gmail.comhttps", "email"=>"plususer+test@gmail.comhttps"}]},
71
+ # :instagram=>{:profile=>[{:matched=>"https://instagram.com/__disco__dudehttps", "username"=>"__disco__dudehttps"}]},
72
+
73
+
74
+ # to extract links or data using specific platform like instagram
75
+ extract.extract_matches_by_platform(platform: 'instagram') # or use :instagram
76
+ # {"instagram"=>{:profile=>[{:matched=>"https://instagram.com/__disco__dudehttps", "username"=>"__disco__dudehttps"}]}}
77
+
78
+ # to extract links or data using specific regex like twitter status
79
+ matches = extract.extract_matches_by_regex(regex: SocialsRegex::Regexes::TWITTER_URL_REGEX[:status])
80
+ # [{:matched=>"https://twitter.com/karllorey/status/1259924082067374088", "username"=>"karllorey", "tweet_id"=>"1259924082067374088"},
81
+ # {:matched=>"https://twitter.com/karllorey12/status/12599240820673740883", "username"=>"karllorey12", "tweet_id"=>"12599240820673740883"}]
82
+
83
+
84
+ ```
85
+
86
+ ## References
87
+ - [social-media-profiles-regexs](https://github.com/lorey/social-media-profiles-regexs):
88
+ extract urls of social media profiles with regular expressions
89
+
90
+ ## Development
91
+
92
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
93
+
94
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
95
+
96
+ ## Contributing
97
+
98
+ Bug reports and pull requests are welcome on GitHub; please read the [contributing guide](https://github.com/talaatmagdyx/socials_regex/blob/master/.github/CONTRIBUTING.md) first. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/talaatmagdyx/socials_regex/blob/master/.github/CODE_OF_CONDUCT.md).
99
+
100
+ ## [ChangeLog](./CHANGELOG.md)
101
+
102
+ ## Reporting Bugs / Feature Requests
103
+
104
+ Please [open an Issue on GitHub](https://github.com/talaatmagdyx/socials_regex/issues) if you have feedback, new feature requests, or want to report a bug. Thank you!
105
+
106
+ ## Pull Request
107
+ Please read the [pull request template](https://github.com/talaatmagdyx/socials_regex/blob/master/.github/PULL_REQUEST_TEMPLATE.md) before opening a pull request.
108
+
109
+ ## License
110
+
111
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
112
+
113
+ ## Code of Conduct
114
+
115
+ Everyone interacting in the SocialsRegex project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/talaatmagdyx/socials_regex/blob/master/.github/CODE_OF_CONDUCT.md).
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
data/exe/socials_regex ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'socials_regex'
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SocialsRegex
4
+ class Error < StandardError; end
5
+ end
@@ -0,0 +1,241 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SocialsRegex
4
+ # all supported platforms
5
+ class Platforms
6
+ PLATFORM_FACEBOOK = 'facebook'
7
+ PLATFORM_GITHUB = 'github'
8
+ PLATFORM_LINKEDIN = 'linkedin'
9
+ PLATFORM_TWITTER = 'twitter'
10
+ PLATFORM_INSTAGRAM = 'instagram'
11
+ PLATFORM_YOUTUBE = 'youtube'
12
+ PLATFORM_EMAIL = 'email'
13
+ PLATFORM_HACKER_NEWS = 'hackernews'
14
+ PLATFORM_MEDIUM = 'medium'
15
+ PLATFORM_PHONE = 'phone'
16
+ PLATFORM_REDDIT = 'reddit'
17
+ PLATFORM_SKYPE = 'skype'
18
+ PLATFORM_SNAPCHAT = 'snapchat'
19
+ PLATFORM_STACKEXCHANGE = 'stackexchange'
20
+ PLATFORM_STACKOVERFLOW = 'stackoverflow'
21
+ PLATFORM_TELEGRAM = 'telegram'
22
+ PLATFORM_VIMEO = 'vimeo'
23
+ PLATFORM_XING = 'xing'
24
+ PLATFORM_ANGELLIST = 'angellist'
25
+ PLATFORM_CRUNCHBASE = 'crunchbase'
26
+ PLATFORM_STACKEXCHANGE_NETWORK = 'stackexchange network'
27
+ PLATFORM_WHATSAPP = 'whatsapp'
28
+ PLATFORM_YELP = 'yelp'
29
+
30
+ def self.all
31
+ Platforms.constants
32
+ end
33
+
34
+ def self.show(const_name:)
35
+ Platforms.const_get(const_name)
36
+ rescue NameError
37
+ # raise Error,
38
+ # "#{const_name} platform don't support please read our supported platforms => #{Platforms.all.join(',')}"
39
+ ''
40
+ end
41
+ end
42
+
43
+ # all regex for all platforms
44
+ class Regexes
45
+ ANGELLIST_URL_REGEX = {
46
+ # https://angel.co/company/twitter, https://angel.co/company/twitter/culture
47
+ company: %r{(?:https?:)?//angel\.co/company/(?<company>[A-Za-z0-9_-]+)(?:/(?<company_subpage>[A-Za-z0-9-]+))?},
48
+ # https://angel.co/company/twitter/jobs/576275-engineering-manager
49
+ job: %r{
50
+ (?:https?:)?//angel\.co/company/(?<company>[A-Za-z0-9_-]+)
51
+ /jobs/(?<job_permalink>(?<job_id>[0-9]+)-(?<job_slug>[A-Za-z0-9-]+))
52
+ }x,
53
+ # https://angel.co/p/naval, https://angel.co/u/karllorey
54
+ user: %r{(?:https?:)?//angel\.co/(?<type>u|p)/(?<user>[A-Za-z0-9_-]+)}
55
+ }.freeze
56
+
57
+ CRUNCHBASE_URL_REGEX = {
58
+ # http://crunchbase.com/organization/acme-corp
59
+ company: %r{(?:https?:)?//crunchbase\.com/organization/(?<organization>[A-Za-z0-9_-]+)},
60
+ # http://crunchbase.com/person/karl-lorey
61
+ person: %r{(?:https?:)?//crunchbase\.com/person/(?<person>[A-Za-z0-9_-]+)}
62
+ }.freeze
63
+
64
+ EMAIL_URL_REGEX = {
65
+ # jeff@amazon.com, mailto:jeff@amazon.com, mailto:plususer+test@gmail.com
66
+ email: /(?:mailto:)?(?<email>[A-Za-z0-9_.+-]+@[A-Za-z0-9_.-]+\.[A-Za-z]+)/,
67
+ email_without_extract: /\A[\w+\-.]+@[a-z\d-]+(\.[a-z\d-]+)*\.[a-z]+\z/i
68
+ }.freeze
69
+
70
+ FACEBOOK_URL_REGEX = {
71
+ # http://fb.com/peter_parker-miller, https://facebook.com/peter.parker, https://facebook.com/peterparker
72
+ username: %r{
73
+ (?:https?:)?//(?:www\.)?(?:facebook|fb)\.com/(?<profile>(?![A-Za-z]+\.php)
74
+ (?!marketplace|gaming|watch|me|messages|help|search|groups)[A-Za-z0-9_\-.]+)/?
75
+ }x,
76
+ # https://www.facebook.com/100004123456789, https://www.facebook.com/profile.php?id=100004123456789
77
+ profile_id: %r{(?:https?:)?//(?:www\.)?facebook\.com/(?:profile\.php\?id=)?(?<id>[0-9]+)}
78
+ }.freeze
79
+
80
+ GITHUB_URL_REGEX = {
81
+ # https://github.com/talaatmagdyx/socials_regex
82
+ repo: %r{(?:https?:)?//(?:www\.)?github\.com/(?<login>[A-Za-z0-9_-]+)/(?<repo>[A-Za-z0-9_-]+)/?},
83
+ # https://github.com/talaatmagdyx
84
+ user: %r{(?:https?:)?//(?:www\.)?github\.com/(?<login>[A-Za-z0-9_-]+)/?}
85
+ }.freeze
86
+
87
+ HACKERNEWS_URL_REGEX = {
88
+ # An item can be a post or a direct link to a comment.
89
+ # https://news.ycombinator.com/item?id=23290375
90
+ item: %r{(?:https?:)?//news\.ycombinator\.com/item\?id=(?<item>[0-9]+)},
91
+ # https://news.ycombinator.com/user?id=CamelCaps
92
+ user: %r{(?:https?:)?//news\.ycombinator\.com/user\?id=(?<user>[A-Za-z0-9_-]+)}
93
+ }.freeze
94
+
95
+ INSTAGRAM_URL_REGEX = {
96
+ # The rules:
97
+ # Matches with one . in them disco.dude but not two .. disco..dude
98
+ # Ending period not matched discodude.
99
+ # Match underscores _disco__dude
100
+ # Max characters of 30 1234567890123456789012345678901234567890
101
+ # https://instagram.com/__disco__dude
102
+ profile: %r{(?:https?:)?//(?:www\.)?(?:instagram\.com|instagr\.am)/
103
+ (?<username>[A-Za-z0-9_](?:(?:[A-Za-z0-9_]|(?:\.(?!\.))){0,28}(?:[A-Za-z0-9_]))?)}x
104
+ }.freeze
105
+
106
+ LINKEDIN_URL_REGEX = {
107
+ # https://fr.linkedin.com/school/université-grenoble-alpes/
108
+ company: %r{(?:https?:)?//(?:\w+\.)?linkedin\.com/(?<company_type>
109
+ (?:company|school))/(?<company_permalink>[A-Za-z0-9\-À-ÿ.]+)/?}x,
110
+ # https://www.linkedin.com/feed/update/urn:li:activity:6665508550111912345/
111
+ post: %r{(?:https?:)?//(?:\w+\.)?linkedin\.com/feed/update/urn:li:activity:(?<activity_id>[0-9]+)/?},
112
+ # https://www.linkedin.com/in/talaatmagdyx/
113
+ profile: %r{(?:https?:)?//(?:\w+\.)?linkedin\.com/in/(?<permalink>[\w\-_À-ÿ%]+)/?}
114
+ }.freeze
115
+
116
+ MEDIUM_URL_REGEX = {
117
+ # https://medium.com/does-exist/some-post-123abc
118
+ post: %r{(?:https?:)?//medium\.com/(?:(?:@(?<username>[A-Za-z0-9]+))|(?<publication>[a-z-]+))/
119
+ (?<slug>[a-z0-9-]+)-(?<post_id>[A-Za-z0-9]+)(?:\?.*)?}x,
120
+ # https://onezero.medium.com/what-facebooks-remote-work-policy-means-for-the-future-of-tech-salaries-everywhere-edf859226b62?source=grid_home------
121
+ # Can't match these with the regular post regex, as redefining named groups is not allowed in Python's regex engine.
122
+ post_of_subdomain_publication: %r{(?:https?:)?//(?<publication>(?!www)[a-z-]+)\.medium\.com/
123
+ (?<slug>[a-z0-9-]+)-(?<post_id>[A-Za-z0-9]+)(?:\?.*)?}x,
124
+ # https://medium.com/@karllorey
125
+ user: %r{(?:https?:)?//medium\.com/@(?<username>[A-Za-z0-9]+)(?:\?.*)?},
126
+ # Now redirects to new user profiles. Follow with a head or get request.
127
+ # https://medium.com/u/b3d3d3653c2c?source=post_page-----da92b81b85ef----------------------
128
+ user_by_id: %r{(?:https?:)?//medium\.com/u/(?<user_id>[A-Za-z0-9]+)(?:\?.*)}
129
+ }.freeze
130
+
131
+ PHONE_URL_REGEX = {
132
+ # Should be cleaned afterwards to strip dots, spaces, etc.
133
+ # tel:+49 900 123456
134
+ # tel:+49900123456
135
+ number: /(?:tel|phone|mobile):(?<number>\+?[0-9. -]+)/
136
+ }.freeze
137
+
138
+ REDDIT_URL_REGEX = {
139
+ # https://old.reddit.com/user/ar-guetita
140
+ # https://reddit.com/u/ar-guetita
141
+ user: %r{(?:https?:)?//(?:[a-z]+\.)?reddit\.com/(?:u(?:ser)?)/(?<username>[A-Za-z0-9\-_]*)/?}
142
+ }.freeze
143
+
144
+ SKYPE_URL_REGEX = {
145
+ # Matches Skype's URLs to add contact, call, chat. More info at Skype SDK's docs.
146
+ # Examples:
147
+ # skype:echo123
148
+ # skype:echo123?call
149
+ profile: /(?:(?:callto|skype):)(?<username>[a-z][a-z0-9.,\-_]{5,31})(?:\?(?:add|call|chat|sendfile|userinfo))?/
150
+ }.freeze
151
+
152
+ SNAPCHAT_URL_REGEX = {
153
+ # https://www.snapchat.com/add/example_user/
154
+ profile: %r{(?:https?:)?//(?:www\.)?snapchat\.com/add/(?<username>[A-Za-z0-9._-]+)/?}
155
+ }.freeze
156
+
157
+ STACKEXCHANGE_URL_REGEX = {
158
+ # This is the meta-platform above stackoverflow, etc. Username can be changed at any time, user_id is persistent.
159
+ # https://www.stackexchange.com/users/12345/example_user/
160
+ user: %r{(?:https?:)?//(?:www\.)?stackexchange\.com/users/(?<id>[0-9]+)/(?<username>[A-Za-z0-9\-_.]+)/?}
161
+ }.freeze
162
+
163
+ STACKEXCHANGE_NETWORK_URL_REGEX = {
164
+ # While there are some "named" communities in the stackexchange network like stackoverflow,
165
+ # many only exist as subdomains, i.e. gaming.stackexchange.com.
166
+ # Again, username can be changed at any time, user_id is persistent.
167
+ # https://gaming.stackexchange.com/users/304007/talaat-magdy
168
+ user: %r{(?:https?:)?//(?:(?<community>[a-z]+(?!www))\.)?stackexchange\.com/users/
169
+ (?<id>[0-9]+)/(?<username>[A-Za-z0-9\-_.]+)/?}x
170
+ }.freeze
171
+
172
+ STACKOVERFLOW_URL_REGEX = {
173
+ # https://stackoverflow.com/questions/12345/how-to-embed
174
+ question: %r{(?:https?:)?//(?:www\.)?stackoverflow\.com/questions/(?<id>[0-9]+)/(?<title>[A-Za-z0-9\-_.]+)/?},
175
+ # Username can be changed at any time, user_id is persistent.
176
+ # https://stackoverflow.com/users/13916928/talaat-magdy
177
+ user: %r{(?:https?:)?//(?:www\.)?stackoverflow\.com/users/(?<id>[0-9]+)/(?<username>[A-Za-z0-9\-_.]+)/?}
178
+ }.freeze
179
+
180
+ TELEGRAM_URL_REGEX = {
181
+ # Matches for t.me, telegram.me and telegram.org.
182
+ # Examples:
183
+ # https://t.me/example_username/
184
+ profile: %r{(?:https?:)?//(?:t(?:elegram)?\.me|telegram\.org)/(?<username>[a-z0-9_]{5,32})/?}
185
+ }.freeze
186
+
187
+ TWITTER_URL_REGEX = {
188
+ # https://twitter.com/karllorey/status/1259924082067374088
189
+ status: %r{(?:https?:)?//(?:[A-Za-z]+\.)?twitter\.com/@?(?<username>[A-Za-z0-9_]+)/status/(?<tweet_id>[0-9]+)/?},
190
+ # Allowed for usernames are alphanumeric characters and underscores.
191
+ # http://twitter.com/@talaatmagdyx
192
+ # http://twitter.com/talaatmagdyx
193
+ # https://twitter.com/talaatmagdyx
194
+ user: %r{(?:https?:)?//(?:[A-Za-z]+\.)?twitter\.com/@?(?!home|share|privacy|tos)(?<username>[A-Za-z0-9_]+)/?}
195
+ }.freeze
196
+
197
+ VIMEO_URL_REGEX = {
198
+ # https://vimeo.com/user12345
199
+ user: %r{(?:https?:)?//vimeo\.com/user(?<id>[0-9]+)},
200
+ # https://vimeo.com/123456789
201
+ # https://player.vimeo.com/video/148751763
202
+ video: %r{(?:https?:)?//(?:(?:www\.)?vimeo\.com|player\.vimeo\.com/video)/(?<id>[0-9]+)}
203
+ }.freeze
204
+
205
+ XING_URL_REGEX = {
206
+ # Default slugs are Firstname_Lastname. If several people with the same name exist, a number is appended.
207
+ # https://www.xing.com/profile/Tobias_Zilbersahn5
208
+ profile: %r{(?:https?:)?//(?:www\.)?xing\.com/profile/(?<slug>[A-z0-9\-_]+)}
209
+ }.freeze
210
+
211
+ YOUTUBE_URL_REGEX = {
212
+ # https://www.youtube.com/channel/UCxyz123456789
213
+ channel: %r{(?:https?:)?//(?:[A-z]+\.)?youtube\.com/channel/(?<id>[A-z0-9\-_]+)},
214
+ # https://www.youtube.com/user/username123
215
+ user: %r{(?:https?:)?//(?:[A-z]+\.)?youtube\.com/user/(?<username>[A-z0-9]+)},
216
+ # https://www.youtube.com/watch?v=dQw4w9WgXcQ
217
+ # https://www.youtube.com/watch?v=dQw4w9WgXcQ
218
+ # https://www.youtube.com/embed/dQw4w9WgXcQ
219
+ # https://www.youtube.com/watch?v=6_b7RDuLwcI
220
+ video: %r{(?:https?:)?//(?:(?:www\.)?youtube\.com/(?:watch\?v=|embed/)|youtu\.be/)(?<id>[A-z0-9\-_]+)}
221
+ }.freeze
222
+
223
+ WHATSAPP_URL_REGEX = {
224
+ # https://wa.me/1234567890
225
+ phone: %r{(?:https?:)?//(?:wa\.me/)(?<number>\+?[0-9. -]+)}
226
+ }.freeze
227
+
228
+ YELP_URL_REGEX = {
229
+ # https://www.yelp.com/biz/example-business
230
+ company: %r{(?:https?://)?(?:www\.)?yelp\.com/biz/(?<company>[A-Za-z0-9_-]+)}
231
+ }.freeze
232
+
233
+ def self.match?(input_str:, regex:)
234
+ input_str.match(regex) ? true : false
235
+ end
236
+
237
+ def self.all
238
+ Regexes.constants
239
+ end
240
+ end
241
+ end
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SocialsRegex
4
+ class Socials
5
+ PLATFORMS_REGEX = {
6
+ "#{SocialsRegex::Platforms::PLATFORM_YELP}": SocialsRegex::Regexes::YELP_URL_REGEX,
7
+ "#{SocialsRegex::Platforms::PLATFORM_WHATSAPP}": SocialsRegex::Regexes::WHATSAPP_URL_REGEX,
8
+ "#{SocialsRegex::Platforms::PLATFORM_STACKEXCHANGE_NETWORK}":
9
+ SocialsRegex::Regexes::STACKEXCHANGE_NETWORK_URL_REGEX,
10
+ "#{SocialsRegex::Platforms::PLATFORM_CRUNCHBASE}": SocialsRegex::Regexes::CRUNCHBASE_URL_REGEX,
11
+ "#{SocialsRegex::Platforms::PLATFORM_ANGELLIST}": SocialsRegex::Regexes::ANGELLIST_URL_REGEX,
12
+ "#{SocialsRegex::Platforms::PLATFORM_XING}": SocialsRegex::Regexes::XING_URL_REGEX,
13
+ "#{SocialsRegex::Platforms::PLATFORM_VIMEO}": SocialsRegex::Regexes::VIMEO_URL_REGEX,
14
+ "#{SocialsRegex::Platforms::PLATFORM_TELEGRAM}": SocialsRegex::Regexes::TELEGRAM_URL_REGEX,
15
+ "#{SocialsRegex::Platforms::PLATFORM_STACKOVERFLOW}": SocialsRegex::Regexes::STACKOVERFLOW_URL_REGEX,
16
+ "#{SocialsRegex::Platforms::PLATFORM_STACKEXCHANGE}": SocialsRegex::Regexes::STACKEXCHANGE_URL_REGEX,
17
+ "#{SocialsRegex::Platforms::PLATFORM_SNAPCHAT}": SocialsRegex::Regexes::SNAPCHAT_URL_REGEX,
18
+ "#{SocialsRegex::Platforms::PLATFORM_SKYPE}": SocialsRegex::Regexes::SKYPE_URL_REGEX,
19
+ "#{SocialsRegex::Platforms::PLATFORM_REDDIT}": SocialsRegex::Regexes::REDDIT_URL_REGEX,
20
+ "#{SocialsRegex::Platforms::PLATFORM_PHONE}": SocialsRegex::Regexes::PHONE_URL_REGEX,
21
+ "#{SocialsRegex::Platforms::PLATFORM_MEDIUM}": SocialsRegex::Regexes::MEDIUM_URL_REGEX,
22
+ "#{SocialsRegex::Platforms::PLATFORM_HACKER_NEWS}": SocialsRegex::Regexes::HACKERNEWS_URL_REGEX,
23
+ "#{SocialsRegex::Platforms::PLATFORM_EMAIL}": SocialsRegex::Regexes::EMAIL_URL_REGEX,
24
+ "#{SocialsRegex::Platforms::PLATFORM_YOUTUBE}": SocialsRegex::Regexes::YOUTUBE_URL_REGEX,
25
+ "#{SocialsRegex::Platforms::PLATFORM_INSTAGRAM}": SocialsRegex::Regexes::INSTAGRAM_URL_REGEX,
26
+ "#{SocialsRegex::Platforms::PLATFORM_TWITTER}": SocialsRegex::Regexes::TWITTER_URL_REGEX,
27
+ "#{SocialsRegex::Platforms::PLATFORM_LINKEDIN}": SocialsRegex::Regexes::LINKEDIN_URL_REGEX,
28
+ "#{SocialsRegex::Platforms::PLATFORM_GITHUB}": SocialsRegex::Regexes::GITHUB_URL_REGEX,
29
+ "#{SocialsRegex::Platforms::PLATFORM_FACEBOOK}": SocialsRegex::Regexes::FACEBOOK_URL_REGEX
30
+ }.freeze
31
+
32
+ ERROR_MSG_UNKNOWN_PLATFORM = "Unknown platform, expected one of #{PLATFORMS_REGEX.keys}"
33
+ end
34
+
35
+ # Extracted profiles.
36
+ class Extraction
37
+ attr_accessor :text
38
+
39
+ def initialize(text:)
40
+ @text = text
41
+ end
42
+
43
+ # Get lists of profiles keyed by platform name.
44
+ # :return: a hash keyed by platform; each value maps a regex name to the list of matches found for that regex.
45
+ def extract_matches_per_platform
46
+ matches = {}
47
+ Socials::PLATFORMS_REGEX.each do |platform, regexes|
48
+ matches.merge!(platform_matches(regexes: regexes, platform: platform))
49
+ end
50
+ matches
51
+ end
52
+
53
+ # Find all matches for a specific regex.
54
+ # :param regex: regex to search for.
55
+ # :return: list of matches.
56
+ def extract_matches_by_regex(regex:)
57
+ matches(regex: regex)
58
+ end
59
+
60
+ # Find all matches for a specific platform.
61
+ # :param platform: platform to search for.
62
+ # :return: hash keyed by the given platform, mapping each regex name to its list of matches.
63
+ def extract_matches_by_platform(platform:)
64
+ regexes = Socials::PLATFORMS_REGEX[platform.to_sym]
65
+ raise Socials::ERROR_MSG_UNKNOWN_PLATFORM unless regexes
66
+
67
+ platform_matches(regexes: regexes, platform: platform)
68
+ end
69
+
70
+ private
71
+
72
+ # Find all matches for a specific platform.
73
+ # :param regexes: hash of regex name => regex to run against the text.
74
+ # :param platform: key under which the matches are stored in the result.
75
+ # :return: hash of list of matches.
76
+ def platform_matches(regexes:, platform:)
77
+ matches = {}
78
+ regexes.each do |key, regex|
79
+ matched = matches(regex: regex)
80
+ next if matched.empty?
81
+
82
+ matches[platform] ||= {}
83
+ matches[platform][key] = matched
84
+ end
85
+ matches
86
+ end
87
+
88
+ def matches(regex:)
89
+ reformat_matches(matches: text.to_enum(:scan, regex).map { Regexp.last_match })
90
+ end
91
+
92
+ def reformat_matches(matches:)
93
+ reformat = []
94
+ matches.each do |match|
95
+ reformat << { matched: match[0] }.merge!(match.named_captures)
96
+ end
97
+ reformat
98
+ end
99
+ end
100
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SocialsRegex
4
+ VERSION = '1.0.0'
5
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'socials_regex/version'
4
+
5
+ # Top-level namespace of the gem; its classes are autoloaded on first use.
6
+ module SocialsRegex
7
+ autoload :Platforms, 'socials_regex/platforms'
8
+ autoload :Regexes, 'socials_regex/platforms'
9
+ autoload :Socials, 'socials_regex/socials'
10
+ autoload :Extraction, 'socials_regex/socials'
11
+ end
@@ -0,0 +1,21 @@
1
+ module SocialsRegex
2
+ class Extraction
3
+ attr_accessor text: string
4
+
5
+ def initialize: (text: string)-> void
6
+
7
+ def extract_matches_per_platform: -> Hash[Symbol, Array[Hash[Symbol, string]]]
8
+
9
+ def extract_matches_by_regex: (regex: Regexp | string) -> Array[Hash[Symbol, string]]
10
+
11
+ def extract_matches_by_platform: (platform: string)-> Hash[Symbol, Array[Hash[Symbol, string]]]
12
+
13
+ private
14
+
15
+ def platform_matches: (regexes: Hash[Symbol, Regexp], platform: string | Symbol) -> Hash[Symbol, Array[Hash[Symbol, string]]]
16
+
17
+ def matches: (regex: Regexp | string) -> Array[Hash[Symbol, string]]
18
+
19
+ def reformat_matches: (matches: Array[MatchData]) -> Array[Hash[Symbol, string]]
20
+ end
21
+ end
@@ -0,0 +1,31 @@
1
+ module SocialsRegex
2
+ class Platforms
3
+ PLATFORM_ANGELLIST: string
4
+ PLATFORM_CRUNCHBASE: string
5
+ PLATFORM_EMAIL: string
6
+ PLATFORM_FACEBOOK: string
7
+ PLATFORM_GITHUB: string
8
+ PLATFORM_HACKER_NEWS: string
9
+ PLATFORM_INSTAGRAM: string
10
+ PLATFORM_LINKEDIN: string
11
+ PLATFORM_MEDIUM: string
12
+ PLATFORM_PHONE: string
13
+ PLATFORM_REDDIT: string
14
+ PLATFORM_SKYPE: string
15
+ PLATFORM_SNAPCHAT: string
16
+ PLATFORM_STACKEXCHANGE: string
17
+ PLATFORM_STACKEXCHANGE_NETWORK: string
18
+ PLATFORM_STACKOVERFLOW: string
19
+ PLATFORM_TELEGRAM: string
20
+ PLATFORM_TWITTER: string
21
+ PLATFORM_VIMEO: string
22
+ PLATFORM_WHATSAPP: string
23
+ PLATFORM_XING: string
24
+ PLATFORM_YELP: string
25
+ PLATFORM_YOUTUBE: string
26
+
27
+ def self.all: -> Array[Symbol]
28
+
29
+ def self.show: (const_name: Symbol | string) -> string
30
+ end
31
+ end
@@ -0,0 +1,31 @@
1
+ module SocialsRegex
2
+ class Regexes
3
+ ANGELLIST_URL_REGEX: Hash[Symbol, Regexp]
4
+ CRUNCHBASE_URL_REGEX: Hash[Symbol, Regexp]
5
+ EMAIL_URL_REGEX: Hash[Symbol, Regexp]
6
+ FACEBOOK_URL_REGEX: Hash[Symbol, Regexp]
7
+ GITHUB_URL_REGEX: Hash[Symbol, Regexp]
8
+ HACKERNEWS_URL_REGEX: Hash[Symbol, Regexp]
9
+ INSTAGRAM_URL_REGEX: Hash[Symbol, Regexp]
10
+ LINKEDIN_URL_REGEX: Hash[Symbol, Regexp]
11
+ MEDIUM_URL_REGEX: Hash[Symbol, Regexp]
12
+ PHONE_URL_REGEX: Hash[Symbol, Regexp]
13
+ REDDIT_URL_REGEX: Hash[Symbol, Regexp]
14
+ SKYPE_URL_REGEX: Hash[Symbol, Regexp]
15
+ SNAPCHAT_URL_REGEX: Hash[Symbol, Regexp]
16
+ STACKEXCHANGE_NETWORK_URL_REGEX: Hash[Symbol, Regexp]
17
+ STACKEXCHANGE_URL_REGEX: Hash[Symbol, Regexp]
18
+ STACKOVERFLOW_URL_REGEX: Hash[Symbol, Regexp]
19
+ TELEGRAM_URL_REGEX: Hash[Symbol, Regexp]
20
+ TWITTER_URL_REGEX: Hash[Symbol, Regexp]
21
+ VIMEO_URL_REGEX: Hash[Symbol, Regexp]
22
+ WHATSAPP_URL_REGEX: Hash[Symbol, Regexp]
23
+ XING_URL_REGEX: Hash[Symbol, Regexp]
24
+ YELP_URL_REGEX: Hash[Symbol, Regexp]
25
+ YOUTUBE_URL_REGEX: Hash[Symbol, Regexp]
26
+
27
+ def self.all: -> Array[Symbol]
28
+
29
+ def self.match?: (input_str: string, regex: Regexp)-> bool
30
+ end
31
+ end
@@ -0,0 +1,6 @@
1
+ module SocialsRegex
2
+ class Socials
3
+ ERROR_MSG_UNKNOWN_PLATFORM: string
4
+ PLATFORMS_REGEX: Hash[Symbol, Hash[Symbol, Regexp]]
5
+ end
6
+ end
@@ -0,0 +1,4 @@
1
+ module SocialsRegex
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: socials_regex
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - talaatmagdyx
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-07-02 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: |-
14
+ Detect and extract URLs of social accounts:
15
+ throw in URLs, get back URLs of social media profiles by type.
16
+ email:
17
+ - talaatmagdy75@gmail.com
18
+ executables:
19
+ - socials_regex
20
+ extensions: []
21
+ extra_rdoc_files: []
22
+ files:
23
+ - ".rspec"
24
+ - ".rubocop.yml"
25
+ - CHANGELOG.md
26
+ - Gemfile
27
+ - Gemfile.lock
28
+ - LICENSE.txt
29
+ - README.md
30
+ - Rakefile
31
+ - exe/socials_regex
32
+ - lib/socials_regex.rb
33
+ - lib/socials_regex/error.rb
34
+ - lib/socials_regex/platforms.rb
35
+ - lib/socials_regex/socials.rb
36
+ - lib/socials_regex/version.rb
37
+ - sig/socials_regex.rbs
38
+ - sig/socials_regex/extraction.rbs
39
+ - sig/socials_regex/platforms.rbs
40
+ - sig/socials_regex/regexes.rbs
41
+ - sig/socials_regex/socials.rbs
42
+ homepage: https://github.com/talaatmagdyx/socials_regex
43
+ licenses:
44
+ - MIT
45
+ metadata:
46
+ homepage_uri: https://github.com/talaatmagdyx/socials_regex
47
+ documentation_uri: https://github.com/talaatmagdyx/socials_regex/wiki
48
+ source_code_uri: https://github.com/talaatmagdyx/socials_regex
49
+ changelog_uri: https://github.com/talaatmagdyx/socials_regex/CHANGELOG.md
50
+ bug_tracker_uri: https://github.com/talaatmagdyx/socials_regex/issues
51
+ wiki_uri: https://github.com/talaatmagdyx/socials_regex/wiki
52
+ rubygems_mfa_required: 'true'
53
+ post_install_message: Thanks for installing!
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 2.6.0
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ requirements: []
68
+ rubygems_version: 3.4.15
69
+ signing_key:
70
+ specification_version: 4
71
+ summary: Social Account Detection and Extraction for Ruby
72
+ test_files: []