socials_regex 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 47c4cf61a023d67d7615180d178f906ac2f1158c78cab9e3be54beb966405443
4
+ data.tar.gz: 82b1baeda51da998982ca6c9634383273c6346b8c95c0e1281499ab127614291
5
+ SHA512:
6
+ metadata.gz: 87ffe002a52f1710bd071076edc4a7c360f1f83d8a3d627cf33fb916ab1a6f785d84135eaa11c4a57a8ab66fe2f05dbc5af8f8f7f7be00cc70ce107737ec384b
7
+ data.tar.gz: f857e78b0cfa5107eb9e57ed8ae760c879cc3ac175bf9a4675be1901e28f65994329c6af8863f0851ceea37823281a59d6599685896a443ab49619a536d1c57f
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,7 @@
1
+ require:
2
+ - rubocop-performance
3
+ - rubocop-rspec
4
+
5
+ AllCops:
6
+ TargetRubyVersion: 2.6
7
+ NewCops: enable
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [1.0.0] - 2023-07-02
4
+
5
+ - first release
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in socials_regex.gemspec
6
+ gemspec
7
+
8
+ gem 'rake', '~> 13.0'
9
+
10
+ group :development, :test do
11
+ gem 'rspec', '~> 3.0'
12
+ gem 'rubocop'
13
+ gem 'rubocop-performance'
14
+ gem 'rubocop-rake'
15
+ gem 'rubocop-rspec'
16
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,73 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ socials_regex (1.0.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ ast (2.4.2)
10
+ diff-lcs (1.5.0)
11
+ json (2.6.3)
12
+ parallel (1.23.0)
13
+ parser (3.2.2.1)
14
+ ast (~> 2.4.1)
15
+ rainbow (3.1.1)
16
+ rake (13.0.6)
17
+ regexp_parser (2.8.0)
18
+ rexml (3.2.5)
19
+ rspec (3.12.0)
20
+ rspec-core (~> 3.12.0)
21
+ rspec-expectations (~> 3.12.0)
22
+ rspec-mocks (~> 3.12.0)
23
+ rspec-core (3.12.2)
24
+ rspec-support (~> 3.12.0)
25
+ rspec-expectations (3.12.3)
26
+ diff-lcs (>= 1.2.0, < 2.0)
27
+ rspec-support (~> 3.12.0)
28
+ rspec-mocks (3.12.5)
29
+ diff-lcs (>= 1.2.0, < 2.0)
30
+ rspec-support (~> 3.12.0)
31
+ rspec-support (3.12.0)
32
+ rubocop (1.52.0)
33
+ json (~> 2.3)
34
+ parallel (~> 1.10)
35
+ parser (>= 3.2.0.0)
36
+ rainbow (>= 2.2.2, < 4.0)
37
+ regexp_parser (>= 1.8, < 3.0)
38
+ rexml (>= 3.2.5, < 4.0)
39
+ rubocop-ast (>= 1.28.0, < 2.0)
40
+ ruby-progressbar (~> 1.7)
41
+ unicode-display_width (>= 2.4.0, < 3.0)
42
+ rubocop-ast (1.29.0)
43
+ parser (>= 3.2.1.0)
44
+ rubocop-capybara (2.18.0)
45
+ rubocop (~> 1.41)
46
+ rubocop-factory_bot (2.23.1)
47
+ rubocop (~> 1.33)
48
+ rubocop-performance (1.18.0)
49
+ rubocop (>= 1.7.0, < 2.0)
50
+ rubocop-ast (>= 0.4.0)
51
+ rubocop-rake (0.6.0)
52
+ rubocop (~> 1.0)
53
+ rubocop-rspec (2.22.0)
54
+ rubocop (~> 1.33)
55
+ rubocop-capybara (~> 2.17)
56
+ rubocop-factory_bot (~> 2.22)
57
+ ruby-progressbar (1.13.0)
58
+ unicode-display_width (2.4.2)
59
+
60
+ PLATFORMS
61
+ x86_64-linux
62
+
63
+ DEPENDENCIES
64
+ rake (~> 13.0)
65
+ rspec (~> 3.0)
66
+ rubocop
67
+ rubocop-performance
68
+ rubocop-rake
69
+ rubocop-rspec
70
+ socials_regex!
71
+
72
+ BUNDLED WITH
73
+ 2.4.13
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 talaatmagdyx
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,115 @@
1
+ # SocialsRegex
2
+
3
+ ----
4
+ Social Regex Account Detection and Extraction for Ruby. Detect and extract URLs of social accounts: throw in URLs, get back URLs of social media profiles by type.
5
+
6
+ Features:
7
+
8
+ - detect the platform a url points to (all major platforms supported)
9
+ - extract the information contained within the url (without opening the url, of course)
10
+ - extract emails and phone numbers from hyperlinks
11
+
12
+ ## Installation
13
+
14
+ Install the gem and add to the application's Gemfile by executing:
15
+
16
+ $ bundle add socials_regex
17
+
18
+ If bundler is not being used to manage dependencies, install the gem by executing:
19
+
20
+ $ gem install socials_regex
21
+
22
+ ### Requirements
23
+ This gem requires Ruby 2.6+
24
+
25
+ ## Usage
26
+
27
+ ```ruby
28
+ require 'socials_regex'
29
+
30
+ supported_platforms = SocialsRegex::Platforms.all
31
+ # [:PLATFORM_FACEBOOK, :PLATFORM_GITHUB, :PLATFORM_LINKEDIN, :PLATFORM_TWITTER, :PLATFORM_INSTAGRAM, :PLATFORM_YOUTUBE,
32
+ # :PLATFORM_EMAIL, :PLATFORM_HACKER_NEWS, :PLATFORM_MEDIUM, :PLATFORM_PHONE, :PLATFORM_REDDIT,
33
+ # :PLATFORM_SKYPE, :PLATFORM_SNAPCHAT, :PLATFORM_STACKEXCHANGE, :PLATFORM_STACKOVERFLOW,
34
+ # :PLATFORM_TELEGRAM, :PLATFORM_VIMEO, :PLATFORM_XING, :PLATFORM_ANGELLIST, :PLATFORM_CRUNCHBASE,
35
+ # :PLATFORM_STACKEXCHANGE_NETWORK, :PLATFORM_WHATSAPP, :PLATFORM_YELP]
36
+
37
+
38
+ supported_regexes = SocialsRegex::Regexes.all
39
+ # [:ANGELLIST_URL_REGEX, :CRUNCHBASE_URL_REGEX, :EMAIL_URL_REGEX, :FACEBOOK_URL_REGEX, :GITHUB_URL_REGEX, :HACKERNEWS_URL_REGEX,
40
+ # :INSTAGRAM_URL_REGEX, :LINKEDIN_URL_REGEX, :MEDIUM_URL_REGEX, :PHONE_URL_REGEX, :REDDIT_URL_REGEX, :SKYPE_URL_REGEX, :SNAPCHAT_URL_REGEX,
41
+ # :STACKEXCHANGE_URL_REGEX, :STACKEXCHANGE_NETWORK_URL_REGEX, :STACKOVERFLOW_URL_REGEX, :TELEGRAM_URL_REGEX, :TWITTER_URL_REGEX,
42
+ # :VIMEO_URL_REGEX, :XING_URL_REGEX, :YOUTUBE_URL_REGEX, :WHATSAPP_URL_REGEX, :YELP_URL_REGEX]
43
+
44
+ # get all regexes for all platforms
45
+ platform_regexes = SocialsRegex::Socials::PLATFORMS_REGEX
46
+ # example [:yelp, {:company=>/(?:https?:\/\/)?(?:www\.)?yelp\.com\/biz\/(?<company>[A-Za-z0-9_-]+)/}]
47
+
48
+ # get regex for specific platforms
49
+ twitter_regex = SocialsRegex::Socials::PLATFORMS_REGEX[:twitter]
50
+ # {:status=>/(?:https?:)?\/\/(?:[A-Za-z]+\.)?twitter\.com\/@?(?<username>[A-Za-z0-9_]+)\/status\/(?<tweet_id>[0-9]+)\/?/,
51
+ # :user=>/(?:https?:)?\/\/(?:[A-Za-z]+\.)?twitter\.com\/@?(?!home|share|privacy|tos)(?<username>[A-Za-z0-9_]+)\/?/}
52
+
53
+
54
+ # how to extract social data from links or texts
55
+ text = 'https://twitter.com/karllorey/status/1259924082067374088 ' \
56
+ 'https://twitter.com/karllorey12/status/12599240820673740883 ' \
57
+ 'http://crunchbase.com/organization/acme-corp jeff@amazon.com mailto:plususer+test@gmail.com ' \
58
+ 'https://facebook.com/peter.parker https://www.facebook.com/profile.php?id=100004123456789 ' \
59
+ 'https://github.com/talaatmagdyx https://github.com/talaatmagdyx/socials_regex ' \
60
+ 'https://news.ycombinator.com/item?id=23290375 https://instagram.com/__disco__dude ' \
61
+ 'https://www.linkedin.com/in/talaatmagdyx/ https://medium.com/does-exist/some-post-123abc'
62
+ extract = SocialsRegex::Extraction.new(text: text)
63
+ # #<SocialsRegex::Extraction:0x00007f5c51d0c488 @text= "https://twitter.com/karllorey/status/......">
64
+
65
+ # to extract all links and data
66
+ extract.extract_matches_per_platform
67
+ # {:crunchbase=>{:company=>[{:matched=>"http://crunchbase.com/organization/acme-corp", "organization"=>"acme-corp"}]},
68
+ # :medium=>{:post=>[{:matched=>"https://medium.com/does-exist/some-post-123abc", "username"=>nil, "publication"=>"does-exist", "slug"=>"some-post", "post_id"=>"123abc"}]},
69
+ # :hackernews=>{:item=>[{:matched=>"https://news.ycombinator.com/item?id=23290375", "item"=>"23290375"}]},
70
+ # :email=>{:email=>[{:matched=>"jeff@amazon.com", "email"=>"jeff@amazon.com"}, {:matched=>"mailto:plususer+test@gmail.com", "email"=>"plususer+test@gmail.com"}]},
71
+ # :instagram=>{:profile=>[{:matched=>"https://instagram.com/__disco__dude", "username"=>"__disco__dude"}]}, ...}
72
+
73
+
74
+ # to extract links or data using specific platform like instagram
75
+ extract.extract_matches_by_platform(platform: 'instagram') # or use :instagram
76
+ # {"instagram"=>{:profile=>[{:matched=>"https://instagram.com/__disco__dude", "username"=>"__disco__dude"}]}}
77
+
78
+ # to extract links or data using specific regex like twitter status
79
+ matches = extract.extract_matches_by_regex(regex: SocialsRegex::Regexes::TWITTER_URL_REGEX[:status])
80
+ # [{:matched=>"https://twitter.com/karllorey/status/1259924082067374088", "username"=>"karllorey", "tweet_id"=>"1259924082067374088"},
81
+ # {:matched=>"https://twitter.com/karllorey12/status/12599240820673740883", "username"=>"karllorey12", "tweet_id"=>"12599240820673740883"}]
82
+
83
+
84
+ ```
85
+
86
+ ## References
87
+ - [social-media-profiles-regexs](https://github.com/lorey/social-media-profiles-regexs):
88
+ extract urls of social media profiles with regular expressions
89
+
90
+ ## Development
91
+
92
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
93
+
94
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
95
+
96
+ ## Contributing
97
+
98
+ Bug reports and pull requests are welcome on GitHub at [Contributing](https://github.com/talaatmagdyx/socials_regex/blob/master/.github/CONTRIBUTING.md). This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/talaatmagdyx/socials_regex/blob/master/.github/CODE_OF_CONDUCT.md).
99
+
100
+ ## [ChangeLog](./CHANGELOG.md)
101
+
102
+ ## Reporting Bugs / Feature Requests
103
+
104
+ Please [open an Issue on GitHub](https://github.com/talaatmagdyx/socials_regex/issues) if you have feedback, new feature requests, or want to report a bug. Thank you!
105
+
106
+ ## Pull Request
107
+ Please read the [pull request template](https://github.com/talaatmagdyx/socials_regex/blob/master/.github/PULL_REQUEST_TEMPLATE.md) before opening a pull request.
108
+
109
+ ## License
110
+
111
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
112
+
113
+ ## Code of Conduct
114
+
115
+ Everyone interacting in the SocialsRegex project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/talaatmagdyx/socials_regex/blob/master/.github/CODE_OF_CONDUCT.md).
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
data/exe/socials_regex ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'socials_regex'
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SocialsRegex
4
+ class Error < StandardError; end # gem-specific error class, a plain StandardError subclass
5
+ end
@@ -0,0 +1,241 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SocialsRegex
4
# Catalogue of the platform identifiers supported by the gem.
#
# Every PLATFORM_* constant holds the lowercase platform name as a String.
class Platforms
  PLATFORM_FACEBOOK = 'facebook'
  PLATFORM_GITHUB = 'github'
  PLATFORM_LINKEDIN = 'linkedin'
  PLATFORM_TWITTER = 'twitter'
  PLATFORM_INSTAGRAM = 'instagram'
  PLATFORM_YOUTUBE = 'youtube'
  PLATFORM_EMAIL = 'email'
  PLATFORM_HACKER_NEWS = 'hackernews'
  PLATFORM_MEDIUM = 'medium'
  PLATFORM_PHONE = 'phone'
  PLATFORM_REDDIT = 'reddit'
  PLATFORM_SKYPE = 'skype'
  PLATFORM_SNAPCHAT = 'snapchat'
  PLATFORM_STACKEXCHANGE = 'stackexchange'
  PLATFORM_STACKOVERFLOW = 'stackoverflow'
  PLATFORM_TELEGRAM = 'telegram'
  PLATFORM_VIMEO = 'vimeo'
  PLATFORM_XING = 'xing'
  PLATFORM_ANGELLIST = 'angellist'
  PLATFORM_CRUNCHBASE = 'crunchbase'
  PLATFORM_STACKEXCHANGE_NETWORK = 'stackexchange network'
  PLATFORM_WHATSAPP = 'whatsapp'
  PLATFORM_YELP = 'yelp'

  # Names of all platform constants defined on this class.
  #
  # @return [Array<Symbol>] constant names, e.g. :PLATFORM_FACEBOOK
  def self.all
    constants
  end

  # Look up the platform name stored under a constant name.
  #
  # @param const_name [Symbol, String] constant name such as :PLATFORM_GITHUB
  # @return [String] the platform name, or '' when the name is not a
  #   platform constant of this class (including nil / non-name arguments)
  def self.show(const_name:)
    # inherit = false keeps the lookup on this class only; otherwise names
    # such as :String or :Kernel would resolve through Object and hand back
    # a class/module instead of a platform name.
    const_get(const_name, false)
  rescue NameError, TypeError
    ''
  end
end
42
+
43
# URL/URI patterns for every supported platform.
#
# Each *_URL_REGEX constant is a frozen hash that maps a pattern name
# (e.g. :user, :profile) to a Regexp. The named captures of each Regexp hold
# the pieces extracted from a matching link.
class Regexes
  ANGELLIST_URL_REGEX = {
    # https://angel.co/company/twitter, https://angel.co/company/twitter/culture
    company: %r{(?:https?:)?//angel\.co/company/(?<company>[A-Za-z0-9_-]+)(?:/(?<company_subpage>[A-Za-z0-9-]+))?},
    # https://angel.co/company/twitter/jobs/576275-engineering-manager
    job: %r{
      (?:https?:)?//angel\.co/company/(?<company>[A-Za-z0-9_-]+)
      /jobs/(?<job_permalink>(?<job_id>[0-9]+)-(?<job_slug>[A-Za-z0-9-]+))
    }x,
    # https://angel.co/p/naval, https://angel.co/u/karllorey
    user: %r{(?:https?:)?//angel\.co/(?<type>u|p)/(?<user>[A-Za-z0-9_-]+)}
  }.freeze

  CRUNCHBASE_URL_REGEX = {
    # http://crunchbase.com/organization/acme-corp
    company: %r{(?:https?:)?//crunchbase\.com/organization/(?<organization>[A-Za-z0-9_-]+)},
    # http://crunchbase.com/person/karl-lorey
    person: %r{(?:https?:)?//crunchbase\.com/person/(?<person>[A-Za-z0-9_-]+)}
  }.freeze

  EMAIL_URL_REGEX = {
    # Finds addresses anywhere inside a text:
    # jeff@amazon.com, mailto:jeff@amazon.com, mailto:plususer+test@gmail.com
    email: /(?:mailto:)?(?<email>[A-Za-z0-9_.+-]+@[A-Za-z0-9_.-]+\.[A-Za-z]+)/,
    # Anchored with \A/\z: only matches when the whole string is one address.
    email_without_extract: /\A[\w+\-.]+@[a-z\d-]+(\.[a-z\d-]+)*\.[a-z]+\z/i
  }.freeze

  FACEBOOK_URL_REGEX = {
    # http://fb.com/peter_parker-miller, https://facebook.com/peter.parker, https://facebook.com/peterparker
    # The two negative lookaheads skip *.php pages and a list of reserved paths.
    username: %r{
      (?:https?:)?//(?:www\.)?(?:facebook|fb)\.com/(?<profile>(?![A-Za-z]+\.php)
      (?!marketplace|gaming|watch|me|messages|help|search|groups)[A-Za-z0-9_\-.]+)/?
    }x,
    # https://www.facebook.com/100004123456789, https://www.facebook.com/profile.php?id=100004123456789
    profile_id: %r{(?:https?:)?//(?:www\.)?facebook\.com/(?:profile\.php\?id=)?(?<id>[0-9]+)}
  }.freeze

  GITHUB_URL_REGEX = {
    # https://github.com/talaatmagdyx/socials_regex
    repo: %r{(?:https?:)?//(?:www\.)?github\.com/(?<login>[A-Za-z0-9_-]+)/(?<repo>[A-Za-z0-9_-]+)/?},
    # https://github.com/talaatmagdyx
    user: %r{(?:https?:)?//(?:www\.)?github\.com/(?<login>[A-Za-z0-9_-]+)/?}
  }.freeze

  HACKERNEWS_URL_REGEX = {
    # An item can be a post or a direct link to a comment.
    # https://news.ycombinator.com/item?id=23290375
    item: %r{(?:https?:)?//news\.ycombinator\.com/item\?id=(?<item>[0-9]+)},
    # https://news.ycombinator.com/user?id=CamelCaps
    user: %r{(?:https?:)?//news\.ycombinator\.com/user\?id=(?<user>[A-Za-z0-9_-]+)}
  }.freeze

  INSTAGRAM_URL_REGEX = {
    # The rules:
    # Matches with one . in them disco.dude but not two .. disco..dude
    # Ending period not matched discodude.
    # Match underscores _disco__dude
    # Max characters of 30 1234567890123456789012345678901234567890
    # https://instagram.com/__disco__dude
    profile: %r{(?:https?:)?//(?:www\.)?(?:instagram\.com|instagr\.am)/
      (?<username>[A-Za-z0-9_](?:(?:[A-Za-z0-9_]|(?:\.(?!\.))){0,28}(?:[A-Za-z0-9_]))?)}x
  }.freeze

  LINKEDIN_URL_REGEX = {
    # https://fr.linkedin.com/school/université-grenoble-alpes/
    company: %r{(?:https?:)?//(?:\w+\.)?linkedin\.com/(?<company_type>
      (?:company|school))/(?<company_permalink>[A-Za-z0-9\-À-ÿ.]+)/?}x,
    # https://www.linkedin.com/feed/update/urn:li:activity:6665508550111912345/
    post: %r{(?:https?:)?//(?:\w+\.)?linkedin\.com/feed/update/urn:li:activity:(?<activity_id>[0-9]+)/?},
    # https://www.linkedin.com/in/talaatmagdyx/
    profile: %r{(?:https?:)?//(?:\w+\.)?linkedin\.com/in/(?<permalink>[\w\-_À-ÿ%]+)/?}
  }.freeze

  MEDIUM_URL_REGEX = {
    # https://medium.com/does-exist/some-post-123abc
    post: %r{(?:https?:)?//medium\.com/(?:(?:@(?<username>[A-Za-z0-9]+))|(?<publication>[a-z-]+))/
      (?<slug>[a-z0-9-]+)-(?<post_id>[A-Za-z0-9]+)(?:\?.*)?}x,
    # https://onezero.medium.com/what-facebooks-remote-work-policy-means-for-the-future-of-tech-salaries-everywhere-edf859226b62?source=grid_home------
    # Kept as its own pattern instead of being folded into :post, because both
    # capture a group named `publication`.
    post_of_subdomain_publication: %r{(?:https?:)?//(?<publication>(?!www)[a-z-]+)\.medium\.com/
      (?<slug>[a-z0-9-]+)-(?<post_id>[A-Za-z0-9]+)(?:\?.*)?}x,
    # https://medium.com/@karllorey
    user: %r{(?:https?:)?//medium\.com/@(?<username>[A-Za-z0-9]+)(?:\?.*)?},
    # Now redirects to new user profiles. Follow with a head or get request.
    # https://medium.com/u/b3d3d3653c2c?source=post_page-----da92b81b85ef----------------------
    # The query string is optional (as in the sibling patterns), so a bare
    # https://medium.com/u/<id> link is matched too.
    user_by_id: %r{(?:https?:)?//medium\.com/u/(?<user_id>[A-Za-z0-9]+)(?:\?.*)?}
  }.freeze

  PHONE_URL_REGEX = {
    # Should be cleaned afterwards to strip dots, spaces, etc.
    # tel:+49 900 123456
    # tel:+49900123456
    number: /(?:tel|phone|mobile):(?<number>\+?[0-9. -]+)/
  }.freeze

  REDDIT_URL_REGEX = {
    # https://old.reddit.com/user/ar-guetita
    # https://reddit.com/u/ar-guetita
    user: %r{(?:https?:)?//(?:[a-z]+\.)?reddit\.com/(?:u(?:ser)?)/(?<username>[A-Za-z0-9\-_]*)/?}
  }.freeze

  SKYPE_URL_REGEX = {
    # Matches Skype's URLs to add contact, call, chat. More info at Skype SDK's docs.
    # Examples:
    # skype:echo123
    # skype:echo123?call
    profile: /(?:(?:callto|skype):)(?<username>[a-z][a-z0-9.,\-_]{5,31})(?:\?(?:add|call|chat|sendfile|userinfo))?/
  }.freeze

  SNAPCHAT_URL_REGEX = {
    # https://www.snapchat.com/add/example_user/
    profile: %r{(?:https?:)?//(?:www\.)?snapchat\.com/add/(?<username>[A-Za-z0-9._-]+)/?}
  }.freeze

  STACKEXCHANGE_URL_REGEX = {
    # This is the meta-platform above stackoverflow, etc. Username can be changed at any time, user_id is persistent.
    # https://www.stackexchange.com/users/12345/example_user/
    user: %r{(?:https?:)?//(?:www\.)?stackexchange\.com/users/(?<id>[0-9]+)/(?<username>[A-Za-z0-9\-_.]+)/?}
  }.freeze

  STACKEXCHANGE_NETWORK_URL_REGEX = {
    # While there are some "named" communities in the stackexchange network like stackoverflow,
    # many only exist as subdomains, i.e. gaming.stackexchange.com.
    # Again, username can be changed at any time, user_id is persistent.
    # https://gaming.stackexchange.com/users/304007/talaat-magdy
    # The lookahead has to sit in front of the subdomain to exclude "www";
    # placed after [a-z]+ it never rejected anything.
    user: %r{(?:https?:)?//(?:(?<community>(?!www\.)[a-z]+)\.)?stackexchange\.com/users/
      (?<id>[0-9]+)/(?<username>[A-Za-z0-9\-_.]+)/?}x
  }.freeze

  STACKOVERFLOW_URL_REGEX = {
    # https://stackoverflow.com/questions/12345/how-to-embed
    question: %r{(?:https?:)?//(?:www\.)?stackoverflow\.com/questions/(?<id>[0-9]+)/(?<title>[A-Za-z0-9\-_.]+)/?},
    # Username can be changed at any time, user_id is persistent.
    # https://stackoverflow.com/users/13916928/talaat-magdy
    user: %r{(?:https?:)?//(?:www\.)?stackoverflow\.com/users/(?<id>[0-9]+)/(?<username>[A-Za-z0-9\-_.]+)/?}
  }.freeze

  TELEGRAM_URL_REGEX = {
    # Matches for t.me, telegram.me and telegram.org.
    # Examples:
    # https://t.me/example_username/
    profile: %r{(?:https?:)?//(?:t(?:elegram)?\.me|telegram\.org)/(?<username>[a-z0-9_]{5,32})/?}
  }.freeze

  TWITTER_URL_REGEX = {
    # https://twitter.com/karllorey/status/1259924082067374088
    status: %r{(?:https?:)?//(?:[A-Za-z]+\.)?twitter\.com/@?(?<username>[A-Za-z0-9_]+)/status/(?<tweet_id>[0-9]+)/?},
    # Allowed for usernames are alphanumeric characters and underscores.
    # http://twitter.com/@talaatmagdyx
    # http://twitter.com/talaatmagdyx
    # https://twitter.com/talaatmagdyx
    user: %r{(?:https?:)?//(?:[A-Za-z]+\.)?twitter\.com/@?(?!home|share|privacy|tos)(?<username>[A-Za-z0-9_]+)/?}
  }.freeze

  VIMEO_URL_REGEX = {
    # https://vimeo.com/user12345
    user: %r{(?:https?:)?//vimeo\.com/user(?<id>[0-9]+)},
    # https://vimeo.com/123456789
    # https://player.vimeo.com/video/148751763
    video: %r{(?:https?:)?//(?:(?:www\.)?vimeo\.com|player\.vimeo\.com/video)/(?<id>[0-9]+)}
  }.freeze

  XING_URL_REGEX = {
    # Default slugs are Firstname_Lastname. If several people with the same name exist, a number is appended.
    # https://www.xing.com/profile/Tobias_Zilbersahn5
    # [A-Za-z], not [A-z]: the latter also spans the punctuation between "Z" and "a".
    profile: %r{(?:https?:)?//(?:www\.)?xing\.com/profile/(?<slug>[A-Za-z0-9\-_]+)}
  }.freeze

  YOUTUBE_URL_REGEX = {
    # https://www.youtube.com/channel/UCxyz123456789
    channel: %r{(?:https?:)?//(?:[A-Za-z]+\.)?youtube\.com/channel/(?<id>[A-Za-z0-9\-_]+)},
    # https://www.youtube.com/user/username123
    user: %r{(?:https?:)?//(?:[A-Za-z]+\.)?youtube\.com/user/(?<username>[A-Za-z0-9]+)},
    # https://www.youtube.com/watch?v=dQw4w9WgXcQ
    # https://www.youtube.com/embed/dQw4w9WgXcQ
    # https://www.youtube.com/watch?v=6_b7RDuLwcI
    video: %r{(?:https?:)?//(?:(?:www\.)?youtube\.com/(?:watch\?v=|embed/)|youtu\.be/)(?<id>[A-Za-z0-9\-_]+)}
  }.freeze

  WHATSAPP_URL_REGEX = {
    # https://wa.me/1234567890
    phone: %r{(?:https?:)?//(?:wa\.me/)(?<number>\+?[0-9. -]+)}
  }.freeze

  YELP_URL_REGEX = {
    # https://www.yelp.com/biz/example-business
    company: %r{(?:https?://)?(?:www\.)?yelp\.com/biz/(?<company>[A-Za-z0-9_-]+)}
  }.freeze

  # Tell whether a string contains a match for a pattern.
  #
  # @param input_str [String] text to test
  # @param regex [Regexp] pattern to look for
  # @return [Boolean] true when the pattern matches somewhere in input_str
  def self.match?(input_str:, regex:)
    input_str.match?(regex)
  end

  # Names of all pattern constants defined on this class.
  #
  # @return [Array<Symbol>] constant names, e.g. :TWITTER_URL_REGEX
  def self.all
    constants
  end
end
241
+ end
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SocialsRegex
4
# Lookup table tying each platform name to its URL patterns.
class Socials
  # Platform name (as a Symbol) => hash of named patterns for that platform.
  # The platform strings are converted to symbols in one pass below.
  PLATFORMS_REGEX = {
    SocialsRegex::Platforms::PLATFORM_YELP => SocialsRegex::Regexes::YELP_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_WHATSAPP => SocialsRegex::Regexes::WHATSAPP_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_STACKEXCHANGE_NETWORK => SocialsRegex::Regexes::STACKEXCHANGE_NETWORK_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_CRUNCHBASE => SocialsRegex::Regexes::CRUNCHBASE_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_ANGELLIST => SocialsRegex::Regexes::ANGELLIST_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_XING => SocialsRegex::Regexes::XING_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_VIMEO => SocialsRegex::Regexes::VIMEO_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_TELEGRAM => SocialsRegex::Regexes::TELEGRAM_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_STACKOVERFLOW => SocialsRegex::Regexes::STACKOVERFLOW_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_STACKEXCHANGE => SocialsRegex::Regexes::STACKEXCHANGE_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_SNAPCHAT => SocialsRegex::Regexes::SNAPCHAT_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_SKYPE => SocialsRegex::Regexes::SKYPE_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_REDDIT => SocialsRegex::Regexes::REDDIT_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_PHONE => SocialsRegex::Regexes::PHONE_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_MEDIUM => SocialsRegex::Regexes::MEDIUM_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_HACKER_NEWS => SocialsRegex::Regexes::HACKERNEWS_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_EMAIL => SocialsRegex::Regexes::EMAIL_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_YOUTUBE => SocialsRegex::Regexes::YOUTUBE_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_INSTAGRAM => SocialsRegex::Regexes::INSTAGRAM_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_TWITTER => SocialsRegex::Regexes::TWITTER_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_LINKEDIN => SocialsRegex::Regexes::LINKEDIN_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_GITHUB => SocialsRegex::Regexes::GITHUB_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_FACEBOOK => SocialsRegex::Regexes::FACEBOOK_URL_REGEX
  }.transform_keys(&:to_sym).freeze

  # Message raised when a caller asks for a platform missing from PLATFORMS_REGEX.
  ERROR_MSG_UNKNOWN_PLATFORM = "Unknown platform, expected one of #{PLATFORMS_REGEX.keys}"
end
34
+
35
# Pulls social-profile matches out of a piece of text.
class Extraction
  # Text that is scanned for matches; nil is treated as an empty string.
  attr_accessor :text

  # @param text [String, nil] free text or a run of links to scan
  def initialize(text:)
    @text = text
  end

  # Collect the matches of every known platform.
  #
  # @return [Hash] platform (Symbol) => { pattern name => list of matches };
  #   platforms without any match are omitted
  def extract_matches_per_platform
    Socials::PLATFORMS_REGEX.each_with_object({}) do |(platform, regexes), found|
      found.merge!(platform_matches(regexes: regexes, platform: platform))
    end
  end

  # Find all matches for a specific regex.
  #
  # @param regex [Regexp] pattern to scan the text with
  # @return [Array<Hash>] one hash per match: :matched holds the full match,
  #   the remaining (String) keys are the pattern's named captures
  def extract_matches_by_regex(regex:)
    matches(regex: regex)
  end

  # Find all matches for a specific platform.
  #
  # @param platform [String, Symbol] platform name, e.g. 'instagram' or :instagram
  # @return [Hash] { platform => { pattern name => list of matches } }, keyed
  #   by the platform value exactly as it was passed in; empty when nothing matched
  # @raise [RuntimeError] when the platform is not a key of Socials::PLATFORMS_REGEX
  def extract_matches_by_platform(platform:)
    # Values that cannot be turned into a symbol (nil, numbers, ...) are
    # reported as unknown platforms instead of failing with NoMethodError.
    regexes = Socials::PLATFORMS_REGEX[platform.to_sym] if platform.respond_to?(:to_sym)
    raise Socials::ERROR_MSG_UNKNOWN_PLATFORM unless regexes

    platform_matches(regexes: regexes, platform: platform)
  end

  private

  # Run every pattern of one platform against the text.
  #
  # @param regexes [Hash] pattern name => Regexp
  # @param platform [String, Symbol] key used for the result
  # @return [Hash] { platform => { pattern name => matches } }, or {} when no
  #   pattern matched
  def platform_matches(regexes:, platform:)
    by_pattern = regexes.each_with_object({}) do |(key, regex), found|
      matched = matches(regex: regex)
      found[key] = matched unless matched.empty?
    end
    by_pattern.empty? ? {} : { platform => by_pattern }
  end

  # Scan the text and return the reformatted matches of one pattern.
  def matches(regex:)
    # scan only yields strings; reading Regexp.last_match inside the
    # enumerator gives access to the full MatchData of each hit.
    reformat_matches(matches: text.to_s.to_enum(:scan, regex).map { Regexp.last_match })
  end

  # Turn MatchData objects into plain hashes (:matched plus named captures).
  def reformat_matches(matches:)
    matches.map { |match| { matched: match[0] }.merge(match.named_captures) }
  end
end
100
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SocialsRegex
4
+ VERSION = '1.0.0' # version string of the gem
5
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'socials_regex/version'
4
+
5
# Entry point of the gem: registers lazy loading for its classes.
#
# Each constant is loaded from the listed file the first time it is referenced.
module SocialsRegex
  # error.rb ships with the gem but was not loaded from anywhere; registering
  # it makes SocialsRegex::Error reachable.
  autoload :Error, 'socials_regex/error'
  autoload :Platforms, 'socials_regex/platforms'
  autoload :Regexes, 'socials_regex/platforms'
  autoload :Socials, 'socials_regex/socials'
  autoload :Extraction, 'socials_regex/socials'
end
@@ -0,0 +1,21 @@
1
+ module SocialsRegex
2
+ class Extraction
3
+ attr_accessor text: string
4
+
5
+ def initialize: (text: string)-> void
6
+
7
+ def extract_matches_per_platform: -> Hash[Symbol, Array[Hash[Symbol, string]]]
8
+
9
+ def extract_matches_by_regex: (regex: Regexp | string) -> Array[Hash[Symbol, string]]
10
+
11
+ def extract_matches_by_platform: (platform: string)-> Hash[Symbol, Array[Hash[Symbol, string]]]
12
+
13
+ private
14
+
15
+ def platform_matches: (regexes: Hash[Symbol, Regexp], platform: string | Symbol) -> Hash[Symbol, Array[Hash[Symbol, string]]]
16
+
17
+ def matches: (regex: Regexp | string) -> Array[Hash[Symbol, string]]
18
+
19
+ def reformat_matches: (matches: Array[MatchData]) -> Array[Hash[Symbol, string]]
20
+ end
21
+ end
@@ -0,0 +1,31 @@
1
+ module SocialsRegex
2
+ class Platforms
3
+ PLATFORM_ANGELLIST: string
4
+ PLATFORM_CRUNCHBASE: string
5
+ PLATFORM_EMAIL: string
6
+ PLATFORM_FACEBOOK: string
7
+ PLATFORM_GITHUB: string
8
+ PLATFORM_HACKER_NEWS: string
9
+ PLATFORM_INSTAGRAM: string
10
+ PLATFORM_LINKEDIN: string
11
+ PLATFORM_MEDIUM: string
12
+ PLATFORM_PHONE: string
13
+ PLATFORM_REDDIT: string
14
+ PLATFORM_SKYPE: string
15
+ PLATFORM_SNAPCHAT: string
16
+ PLATFORM_STACKEXCHANGE: string
17
+ PLATFORM_STACKEXCHANGE_NETWORK: string
18
+ PLATFORM_STACKOVERFLOW: string
19
+ PLATFORM_TELEGRAM: string
20
+ PLATFORM_TWITTER: string
21
+ PLATFORM_VIMEO: string
22
+ PLATFORM_WHATSAPP: string
23
+ PLATFORM_XING: string
24
+ PLATFORM_YELP: string
25
+ PLATFORM_YOUTUBE: string
26
+
27
+ def self.all: -> Array[Symbol]
28
+
29
+ def self.show: (const_name: Symbol | string) -> string
30
+ end
31
+ end
@@ -0,0 +1,31 @@
1
+ module SocialsRegex
2
+ class Regexes
3
+ ANGELLIST_URL_REGEX: Hash[Symbol, Regexp]
4
+ CRUNCHBASE_URL_REGEX: Hash[Symbol, Regexp]
5
+ EMAIL_URL_REGEX: Hash[Symbol, Regexp]
6
+ FACEBOOK_URL_REGEX: Hash[Symbol, Regexp]
7
+ GITHUB_URL_REGEX: Hash[Symbol, Regexp]
8
+ HACKERNEWS_URL_REGEX: Hash[Symbol, Regexp]
9
+ INSTAGRAM_URL_REGEX: Hash[Symbol, Regexp]
10
+ LINKEDIN_URL_REGEX: Hash[Symbol, Regexp]
11
+ MEDIUM_URL_REGEX: Hash[Symbol, Regexp]
12
+ PHONE_URL_REGEX: Hash[Symbol, Regexp]
13
+ REDDIT_URL_REGEX: Hash[Symbol, Regexp]
14
+ SKYPE_URL_REGEX: Hash[Symbol, Regexp]
15
+ SNAPCHAT_URL_REGEX: Hash[Symbol, Regexp]
16
+ STACKEXCHANGE_NETWORK_URL_REGEX: Hash[Symbol, Regexp]
17
+ STACKEXCHANGE_URL_REGEX: Hash[Symbol, Regexp]
18
+ STACKOVERFLOW_URL_REGEX: Hash[Symbol, Regexp]
19
+ TELEGRAM_URL_REGEX: Hash[Symbol, Regexp]
20
+ TWITTER_URL_REGEX: Hash[Symbol, Regexp]
21
+ VIMEO_URL_REGEX: Hash[Symbol, Regexp]
22
+ WHATSAPP_URL_REGEX: Hash[Symbol, Regexp]
23
+ XING_URL_REGEX: Hash[Symbol, Regexp]
24
+ YELP_URL_REGEX: Hash[Symbol, Regexp]
25
+ YOUTUBE_URL_REGEX: Hash[Symbol, Regexp]
26
+
27
+ def self.all: -> Array[Symbol]
28
+
29
+ def self.match?: (input_str: string, regex: Regexp)-> bool
30
+ end
31
+ end
@@ -0,0 +1,6 @@
1
+ module SocialsRegex
2
+ class Socials
3
+ ERROR_MSG_UNKNOWN_PLATFORM: string
4
+ PLATFORMS_REGEX: Hash[Symbol, Hash[Symbol, Regexp]]
5
+ end
6
+ end
@@ -0,0 +1,4 @@
1
+ module SocialsRegex
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: socials_regex
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - talaatmagdyx
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-07-02 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: |-
14
+ Detect and extract URLs of social accounts:
15
+ throw in URLs, get back URLs of social media profiles by type.
16
+ email:
17
+ - talaatmagdy75@gmail.com
18
+ executables:
19
+ - socials_regex
20
+ extensions: []
21
+ extra_rdoc_files: []
22
+ files:
23
+ - ".rspec"
24
+ - ".rubocop.yml"
25
+ - CHANGELOG.md
26
+ - Gemfile
27
+ - Gemfile.lock
28
+ - LICENSE.txt
29
+ - README.md
30
+ - Rakefile
31
+ - exe/socials_regex
32
+ - lib/socials_regex.rb
33
+ - lib/socials_regex/error.rb
34
+ - lib/socials_regex/platforms.rb
35
+ - lib/socials_regex/socials.rb
36
+ - lib/socials_regex/version.rb
37
+ - sig/socials_regex.rbs
38
+ - sig/socials_regex/extraction.rbs
39
+ - sig/socials_regex/platforms.rbs
40
+ - sig/socials_regex/regexes.rbs
41
+ - sig/socials_regex/socials.rbs
42
+ homepage: https://github.com/talaatmagdyx/socials_regex
43
+ licenses:
44
+ - MIT
45
+ metadata:
46
+ homepage_uri: https://github.com/talaatmagdyx/socials_regex
47
+ documentation_uri: https://github.com/talaatmagdyx/socials_regex/wiki
48
+ source_code_uri: https://github.com/talaatmagdyx/socials_regex
49
+ changelog_uri: https://github.com/talaatmagdyx/socials_regex/blob/master/CHANGELOG.md
50
+ bug_tracker_uri: https://github.com/talaatmagdyx/socials_regex/issues
51
+ wiki_uri: https://github.com/talaatmagdyx/socials_regex/wiki
52
+ rubygems_mfa_required: 'true'
53
+ post_install_message: Thanks for installing!
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 2.6.0
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ requirements: []
68
+ rubygems_version: 3.4.15
69
+ signing_key:
70
+ specification_version: 4
71
+ summary: Social Account Detection and Extraction for Ruby
72
+ test_files: []