crawler_detect 0.1.9 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +13 -168
- data/.travis.yml +16 -6
- data/CHANGELOG.md +32 -0
- data/Gemfile.lock +119 -0
- data/README.md +13 -0
- data/bin/update_raw_files +10 -0
- data/crawler_detect.gemspec +7 -3
- data/lib/crawler_detect.rb +29 -7
- data/lib/crawler_detect/config.rb +29 -0
- data/lib/crawler_detect/detector.rb +27 -14
- data/lib/crawler_detect/library.rb +9 -3
- data/lib/crawler_detect/library/crawlers.rb +6 -1249
- data/lib/crawler_detect/library/exclusions.rb +6 -50
- data/lib/crawler_detect/library/headers.rb +6 -17
- data/lib/crawler_detect/library/loader.rb +18 -0
- data/lib/crawler_detect/library/raw/Crawlers.json +1 -0
- data/lib/crawler_detect/library/raw/Exclusions.json +1 -0
- data/lib/crawler_detect/library/raw/Headers.json +1 -0
- data/lib/crawler_detect/version.rb +2 -1
- data/lib/rack/crawler_detect.rb +22 -19
- metadata +55 -6
@@ -2,57 +2,13 @@
|
|
2
2
|
|
3
3
|
module CrawlerDetect
|
4
4
|
module Library
|
5
|
+
# @since 0.1.0
|
5
6
|
module Exclusions
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
MSIE.[\d\.]
|
12
|
-
Opera\/[\d\.]*
|
13
|
-
Mozilla.[\d\.]*
|
14
|
-
AppleWebKit.[\d\.]*
|
15
|
-
Trident.[\d\.]*
|
16
|
-
Windows NT.[\d\.]*
|
17
|
-
Android [\d\.]*
|
18
|
-
Macintosh.
|
19
|
-
Ubuntu
|
20
|
-
Linux
|
21
|
-
[ ]Intel
|
22
|
-
Mac OS X [\d_]*
|
23
|
-
(like )?Gecko(.[\d\.]*)?
|
24
|
-
KHTML,
|
25
|
-
CriOS.[\d\.]*
|
26
|
-
CPU iPhone OS ([0-9_])* like Mac OS X
|
27
|
-
CPU OS ([0-9_])* like Mac OS X
|
28
|
-
iPod
|
29
|
-
compatible
|
30
|
-
x86_..
|
31
|
-
i686
|
32
|
-
x64
|
33
|
-
X11
|
34
|
-
rv:[\d\.]*
|
35
|
-
Version.[\d\.]*
|
36
|
-
WOW64
|
37
|
-
Win64
|
38
|
-
Dalvik.[\d\.]*
|
39
|
-
\.NET CLR [\d\.]*
|
40
|
-
Presto.[\d\.]*
|
41
|
-
Media Center PC
|
42
|
-
BlackBerry
|
43
|
-
Build
|
44
|
-
Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.
|
45
|
-
Opera
|
46
|
-
\.NET[\d\.]*
|
47
|
-
cubot
|
48
|
-
; M bot
|
49
|
-
; CRONO
|
50
|
-
; B bot
|
51
|
-
; IDbot
|
52
|
-
; ID bot
|
53
|
-
; POWER BOT
|
54
|
-
;
|
55
|
-
].strip.split(/\n+/).freeze
|
7
|
+
extend Loader
|
8
|
+
|
9
|
+
def self.data
|
10
|
+
@data ||= load_raw(CrawlerDetect.config.settings.raw_exclusions_path).freeze
|
11
|
+
end
|
56
12
|
end
|
57
13
|
end
|
58
14
|
end
|
@@ -2,24 +2,13 @@
|
|
2
2
|
|
3
3
|
module CrawlerDetect
|
4
4
|
module Library
|
5
|
+
# @since 0.1.0
|
5
6
|
module Headers
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
# Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
|
12
|
-
"HTTP_X_DEVICE_USER_AGENT",
|
13
|
-
"HTTP_X_ORIGINAL_USER_AGENT",
|
14
|
-
"HTTP_X_SKYFIRE_PHONE",
|
15
|
-
"HTTP_X_BOLT_PHONE_UA",
|
16
|
-
"HTTP_DEVICE_STOCK_UA",
|
17
|
-
"HTTP_X_UCBROWSER_DEVICE_UA",
|
18
|
-
# Sometimes, bots (especially Google) use a genuine user agent, but fill this header in with their email address
|
19
|
-
"HTTP_FROM",
|
20
|
-
# Seen in use by Netsparker
|
21
|
-
"HTTP_X_SCANNER",
|
22
|
-
].freeze
|
7
|
+
extend Loader
|
8
|
+
|
9
|
+
def self.data
|
10
|
+
@data ||= load_raw(CrawlerDetect.config.settings.raw_headers_path).freeze
|
11
|
+
end
|
23
12
|
end
|
24
13
|
end
|
25
14
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CrawlerDetect
|
4
|
+
module Library
|
5
|
+
# since 1.0.0
|
6
|
+
module Loader
|
7
|
+
# Load JSON raw file
|
8
|
+
def load_raw(path)
|
9
|
+
::Oj.load_file(path)
|
10
|
+
end
|
11
|
+
|
12
|
+
# Remove cached raw data
|
13
|
+
def reload_data
|
14
|
+
remove_instance_variable(:@data) if instance_variable_defined?(:@data)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
[".*Java.*outbrain"," YLT","^b0t$","^bluefish ","^Calypso v\\\/","^COMODO DCV","^DangDang","^DavClnt","^FDM ","^git\\\/","^Goose\\\/","^Grabber","^HTTPClient\\\/","^Java\\\/","^Jeode\\\/","^Jetty\\\/","^Mail\\\/","^Mget","^Microsoft URL Control","^NG\\\/[0-9\\.]","^NING\\\/","^PHP\\\/[0-9]","^RMA\\\/","^Ruby|Ruby\\\/[0-9]","^VSE\\\/[0-9]","^WordPress\\.com","^XRL\\\/[0-9]","^ZmEu","008\\\/","13TABS","192\\.comAgent","2ip\\.ru","404enemy","7Siters","80legs","a\\.pr-cy\\.ru","a3logics\\.in","A6-Indexer","Abonti","Aboundex","aboutthedomain","Accoona-AI-Agent","acebookexternalhit\\\/","acoon","acrylicapps\\.com\\\/pulp","Acunetix","AdAuth\\\/","adbeat","AddThis","ADmantX","AdminLabs","adressendeutschland","adreview\\\/","adscanner","Adstxtaggregator","adstxt-worker","adstxt\\.com","agentslug","AHC","aihit","aiohttp\\\/","Airmail","akka-http\\\/","akula\\\/","alertra","alexa site audit","Alibaba\\.Security\\.Heimdall","Alligator","allloadin","AllSubmitter","alyze\\.info","amagit","^Amazon Simple Notification Service Agent$","Anarchie","AndroidDownloadManager","Anemone","AngleSharp","annotate_google","Ant\\.com","Anturis Agent","AnyEvent-HTTP\\\/","Apache Droid","Apache OpenOffice","Apache-HttpAsyncClient","Apache-HttpClient","ApacheBench","Apexoo","APIs-Google","AportWorm\\\/","AppBeat\\\/","AppEngine-Google","AppleSyndication","Aprc\\\/[0-9]","Arachmo","arachnode","Arachnophilia","aria2","Arukereso","asafaweb","AskQuickly","Ask Jeeves","ASPSeek","Asterias","Astute","asynchttp","Attach","attohttpc","autocite","AutomaticWPTester","Autonomy","axios\\\/","AWS Security Scanner","B-l-i-t-z-B-O-T","Backlink-Ceck","backlink-check","BacklinkHttpStatus","BackStreet","BackupLand","BackWeb","Bad-Neighborhood","Badass","baidu\\.com","Bandit","basicstate","BatchFTP","Battleztar Bazinga","baypup\\\/","BazQux","BBBike","BCKLINKS","BDFetch","BegunAdvertising","Bewica-security-scan","Bidtellect","BigBozz","Bigfoot","biglotron","BingLocalSearch","BingPreview","binlar","biNu image cacher","Bitacle","biz_Directory","Black Hole","Blackboard Safeassign","BlackWidow","BlockNote\\.Net","BlogBridge","Bloglines","Bloglovin","BlogPulseLive","BlogSearch","Blogtrottr","BlowFish","boitho\\.com-dc","Boost\\.Beast","BPImageWalker","Braintree-Webhooks","Branch Metrics API","Branch-Passthrough","Brandprotect","BrandVerity","Brandwatch","Brodie\\\/","Browsershots","BUbiNG","Buck\\\/","Buddy","BuiltWith","Bullseye","BunnySlippers","Burf Search","Butterfly\\\/","BuzzSumo","CAAM\\\/[0-9]","CakePHP","Calculon","Canary%20Mail","CaretNail","catexplorador","CC Metadata Scaper","Cegbfeieh","censys","centuryb.o.t9[at]gmail.com","Cerberian Drtrs","CERT\\.at-Statistics-Survey","cg-eye","changedetection","ChangesMeter","Charlotte","CheckHost","checkprivacy","CherryPicker","ChinaClaw","Chirp\\\/","chkme\\.com","Chlooe","Chromaxa","CirrusExplorer","CISPA Vulnerability Notification","Citoid","CJNetworkQuality","Clarsentia","clips\\.ua\\.ac\\.be","Cloud mapping","CloudEndure","CloudFlare-AlwaysOnline","Cloudflare-Healthchecks","Cloudinary","cmcm\\.com","coccoc","cognitiveseo","colly -","CommaFeed","Commons-HttpClient","commonscan","contactbigdatafr","contentkingapp","convera","CookieReports","copyright sheriff","CopyRightCheck","Copyscape","cortex\\\/","Cosmos4j\\.feedback","Covario-IDS","Craw\\\/","Crescent","Crowsnest","Criteo","CSHttp","CSSCheck","Cula","curb","Curious George","curl","cuwhois\\\/","cybo\\.com","DAP\\\/NetHTTP","DareBoost","DatabaseDriverMysqli","DataCha0s","Datafeedwatch","Datanyze","DataparkSearch","dataprovider","DataXu","Daum(oa)?[ \\\/][0-9]","dBpoweramp","ddline","deeris","delve\\.ai","Demon","DeuSu","developers\\.google\\.com\\\/\\+\\\/web\\\/snippet\\\/","Devil","DHSH","Digg","Digincore","DigitalPebble","Dirbuster","Discourse Forum Onebox","Disqus\\\/","Dispatch\\\/","DittoSpyder","dlvr","DMBrowser","DNSPod-reporting","docoloc","Dolphin http client","DomainAppender","DomainLabz","Donuts Content Explorer","dotMailer content retrieval","dotSemantic","downforeveryoneorjustme","Download Wonder","downnotifier","DowntimeDetector","Drip","drupact","Drupal \\(\\+http:\\\/\\\/drupal\\.org\\\/\\)","DTS Agent","dubaiindex","DuplexWeb-Google","DynatraceSynthetic","EARTHCOM","Easy-Thumb","EasyDL","Ebingbong","ec2linkfinder","eCairn-Grabber","eCatch","ECCP","eContext\\\/","Ecxi","EirGrabber","ElectricMonk","elefent","EMail Exractor","EMail Wolf","EmailWolf","Embarcadero","Embed PHP Library","Embedly","endo\\\/","europarchive\\.org","evc-batch","EventMachine HttpClient","Everwall Link Expander","Evidon","Evrinid","ExactSearch","ExaleadCloudview","Excel\\\/","exif","ExoRank","Exploratodo","Express WebPictures","Extreme Picture Finder","EyeNetIE","ezooms","facebookexternalhit","facebookexternalua","facebookplatform","fairshare","Faraday v","fasthttp","Faveeo","Favicon downloader","faviconkit","faviconarchive","FavOrg","Feed Wrangler","Feedable\\\/","Feedbin","FeedBooster","FeedBucket","FeedBunch\\\/","FeedBurner","feeder","Feedly","FeedshowOnline","Feedspot","FeedViewer\\\/","Feedwind\\\/","FeedZcollector","feeltiptop","Fetch API","Fetch\\\/[0-9]","Fever\\\/[0-9]","FHscan","Filestack","Fimap","findlink","findthatfile","FlashGet","FlipboardBrowserProxy","FlipboardProxy","FlipboardRSS","Flock\\\/","fluffy","Flunky","flynxapp","forensiq","FoundSeoTool","http:\\\/\\\/www.neomo.de\\\/","free thumbnails","Freeuploader","Funnelback","Fuzz Faster U Fool","G-i-g-a-b-o-t","g00g1e\\.net","ganarvisitas","geek-tools","Genieo","GentleSource","GetCode","Getintent","GetLinkInfo","getprismatic","GetRight","getroot","GetURLInfo\\\/","GetWeb","Geziyor","Ghost Inspector","GigablastOpenSource","GIS-LABS","github-camo","github\\.com","Goldfire Server","Go [\\d\\.]* package http","Go http package","Go-Ahead-Got-It","Go-http-client","Go!Zilla","gobyus","gofetch","GomezAgent","gooblog","Goodzer\\\/","Google AppsViewer","Google Desktop","Google favicon","Google Keyword Suggestion","Google Keyword Tool","Google Page Speed Insights","Google PP Default","Google Search Console","Google Web Preview","Google-Ads-Overview","Google-Adwords","Google-Apps-Script","Google-Calendar-Importer","Google-HotelAdsVerifier","Google-HTTP-Java-Client","Google-Publisher-Plugin","Google-Read-Aloud","Google-SearchByImage","Google-Site-Verification","Google-speakr","Google-Structured-Data-Testing-Tool","Google-Youtube-Links","google-xrawler","GoogleDocs","GoogleHC\\\/","GoogleProducer","GoogleSites","Google-Transparency-Report","Gookey","GoSpotCheck","gosquared-thumbnailer","Gotit","GoZilla","grabify","GrabNet","Grafula","Grammarly","GrapeFX","GreatNews","Gregarius","GRequests","grokkit","grouphigh","grub-client","gSOAP\\\/","GT::WWW","GTmetrix","GuzzleHttp","gvfs\\\/","HAA(A)?RTLAND http client","Haansoft","hackney\\\/","Hadi Agent","HappyApps-WebCheck","Hatena","Havij","HaxerMen","HeadlessChrome","HEADMasterSEO","HeartRails_Capture","help@dataminr\\.com","heritrix","Hexometer","historious","hkedcity","hledejLevne\\.cz","Hloader","HMView","Holmes","HonesoSearchEngine","HootSuite Image proxy","Hootsuite-WebFeed","hosterstats","HostTracker","ht:\\\/\\\/check","htdig","HTMLparser","htmlyse","HTTP Banner Detection","HTTP_Compression_Test","http_request2","http_requester","http-get","HTTP-Header-Abfrage","http-kit","http-request\\\/","HTTP-Tiny","HTTP::Lite","http\\.rb\\\/","http_get","HttpComponents","httphr","HTTPMon","HTTPie","httpRequest","httpscheck","httpssites_power","httpunit","HttpUrlConnection","httrack","huaweisymantec","HubSpot ","Humanlinks","i2kconnect\\\/","Iblog","ichiro","Id-search","IdeelaborPlagiaat","IDG Twitter Links Resolver","IDwhois\\\/","Iframely","igdeSpyder","iGooglePortal","IlTrovatore","Image Fetch","Image Sucker","ImageEngine\\\/","ImageVisu\\\/","Imagga","imagineeasy","imgsizer","InAGist","inbound\\.li parser","InDesign%20CC","Indy Library","InetURL","infegy","infohelfer","InfoTekies","InfoWizards Reciprocal Link","inpwrd\\.com","instabid","Instapaper","Integrity","integromedb","Intelliseek","InterGET","internet_archive","Internet Ninja","InternetSeer","internetVista monitor","internetwache","intraVnews","IODC","IOI","iplabel","ips-agent","IPS\\\/[0-9]","IPWorks HTTP\\\/S Component","iqdb\\\/","Iria","Irokez","isitup\\.org","iskanie","isUp\\.li","iThemes Sync\\\/","IZaBEE","iZSearch","JAHHO","janforman","Jaunt\\\/","Jbrofuzz","Jersey\\\/","JetCar","Jigsaw","Jobboerse","JobFeed discovery","Jobg8 URL Monitor","jobo","Jobrapido","Jobsearch1\\.5","JoinVision Generic","JolokiaPwn","Joomla","Jorgee","JS-Kit","JustView","Kaspersky Lab CFR link resolver","Kelny\\\/","Kerrigan\\\/","KeyCDN","Keyword Density","Keywords Research","khttp\\\/","KickFire","KimonoLabs\\\/","Kml-Google","knows\\.is","KOCMOHABT","kouio","kubectl","kube-probe","kulturarw3","KumKie","L\\.webis","Larbin","Lavf\\\/","LeechFTP","LeechGet","letsencrypt","Lftp","LibVLC","LibWeb","Libwhisker","libwww","Licorne","Liferea\\\/","Lightspeedsystems","Lighthouse","Likse","limber\\.io","Link Valet","link_thumbnailer","LinkAlarm\\\/","linkCheck","linkdex","LinkExaminer","linkfluence","linkpeek","LinkPreviewGenerator","LinkScan","LinksManager","LinkTiger","LinkWalker","Lipperhey","Litemage_walker","livedoor ScreenShot","LoadImpactRload","localsearch-web","LongURL API","longurl-r-package","looid\\.com","looksystems\\.net","ltx71","lua-resty-http","lwp-request","lwp-trivial","LWP::Simple","lycos","LYT\\.SR","mabontland","Mag-Net","MagpieRSS","Mail\\.Ru","MailChimp","Majestic12","makecontact\\\/","Mandrill","MapperCmd","marketinggrader","MarkMonitor","MarkWatch","Mass Downloader","masscan\\\/","Mata Hari","Mediametric","Mediapartners-Google","mediawords","MegaIndex\\.ru","MeltwaterNews","Melvil Rawi","MemGator","Metaspinner","MetaURI","MFC_Tear_Sample","MicroMessenger\\\/","Microsearch","Microsoft Office ","Microsoft Outlook","Microsoft Windows Network Diagnostics","Microsoft-WebDAV-MiniRedir","Microsoft Data Access","MIDown tool","MIIxpc","Mindjet","Miniature\\.io","Miniflux","Mister PiX","mixdata dot com","mixed-content-scan","Mixmax-LinkPreview","mixnode","Mnogosearch","mogimogi","Mojeek","Mojolicious \\(Perl\\)","Monit\\\/","monitis","Monitority\\\/","montastic","MonTools","Moreover","Morfeus Fucking Scanner","Morning Paper","MovableType","mowser","Mr\\.4x3 Powered","Mrcgiguy","MS Web Services Client Protocol","MSFrontPage","mShots","MuckRack\\\/","muhstik-scan","MVAClient","MxToolbox\\\/","nagios","Najdi\\.si","Name Intelligence","Nameprotect","Navroad","NearSite","Needle","Nessus","Net Vampire","NetAnts","NETCRAFT","NetLyzer","NetMechanic","NetNewsWire","Netpursual","netresearch","NetShelter ContentScan","Netsparker","NetTrack","Netvibes","NetZIP","Neustar WPM","NeutrinoAPI","NewRelicPinger","NewsBlur .*Finder","NewsGator","newsme","newspaper\\\/","NetSystemsResearch","Nexgate Ruby Client","NG-Search","Nibbler","NICErsPRO","Nikto","nineconnections","NLNZ_IAHarvester","Nmap Scripting Engine","node-superagent","node-urllib","node\\.io","Nodemeter","NodePing","nominet\\.org\\.uk","nominet\\.uk","Norton-Safeweb","Notifixious","notifyninja","NotionEmbedder","nuhk","nutch","Nuzzel","nWormFeedFinder","nyawc\\\/","Nymesis","NYU","Ocelli\\\/","Octopus","oegp","Offline Explorer","Offline Navigator","OgScrper","okhttp","omgili","OMSC","Online Domain Tools","OpenCalaisSemanticProxy","Openfind","OpenLinkProfiler","Openstat\\\/","OpenVAS","OPPO A33","Optimizer","Orbiter","OrgProbe\\\/","orion-semantics","Outlook-Express","Outlook-iOS","ow\\.ly","Owler","Owlin","ownCloud News","OxfordCloudService","Page Valet","page_verifier","page scorer","page2rss","PageFreezer","PageGrabber","PagePeeker","PageScorer","Pagespeed\\\/","Panopta","panscient","Papa Foto","parsijoo","Pavuk","PayPal IPN","pcBrowser","Pcore-HTTP","Pearltrees","PECL::HTTP","peerindex","Peew","PeoplePal","Perlu -","PhantomJS Screenshoter","PhantomJS\\\/","Photon\\\/","phpservermon","Pi-Monster","Picscout","Picsearch","PictureFinder","Pimonster","ping\\.blo\\.gs","Pingability","PingAdmin\\.Ru","Pingdom","Pingoscope","PingSpot","pinterest\\.com","Pixray","Pizilla","Plagger\\\/","Ploetz \\+ Zeller","Plukkie","plumanalytics","PocketImageCache","PocketParser","Pockey","POE-Component-Client-HTTP","Polymail\\\/","Pompos","Porkbun","Port Monitor","postano","PostmanRuntime","PostPost","postrank","PowerPoint\\\/","Prebid","Priceonomics Analysis Engine","PrintFriendly","PritTorrent","Prlog","probethenet","Project 25499","prospectb2b","Protopage","ProWebWalker","proximic","PRTG Network Monitor","pshtt, https scanning","PTST ","PTST\\\/[0-9]+","Pump","python-httpx","Python-httplib2","python-requests","Python-urllib","Qirina Hurdler","QQDownload","QrafterPro","Qseero","Qualidator","QueryN Metasearch","queuedriver","Quora Link Preview","Qwantify","Radian6","RankActive","RankFlex","RankSonicSiteAuditor","Re-re Studio","ReactorNetty","Readability","RealDownload","RealPlayer%20Downloader","RebelMouse","Recorder","RecurPost\\\/","redback\\\/","ReederForMac","Reeder\\\/","ReGet","RepoMonkey","request\\.js","reqwest\\\/","ResponseCodeTest","RestSharp","Riddler","Rival IQ","Robosourcer","Robozilla","ROI Hunter","RPT-HTTPClient","RSSOwl","RyowlEngine","safe-agent-scanner","SalesIntelligent","Saleslift","Sendsay\\.Ru","SauceNAO","SBIder","sc-downloader","scalaj-http","Scamadviser-Frontend","scan\\.lol","ScanAlert","Scoop","scooter","ScoutJet","ScoutURLMonitor","ScrapeBox Page Scanner","Scrapy","Screaming","ScreenShotService","Scrubby","Scrutiny\\\/","search\\.thunderstone","Search37","searchenginepromotionhelp","Searchestate","SearchExpress","SearchSight","Seeker","semanticdiscovery","semanticjuice","Semiocast HTTP client","Semrush","sentry\\\/","SEO Browser","Seo Servis","seo-nastroj\\.cz","seo4ajax","Seobility","SEOCentro","SeoCheck","SEOkicks","SEOlizer","Seomoz","SEOprofiler","SEOsearch","seoscanners","seositecheckup","SEOstats","servernfo","sexsearcher","Seznam","Shelob","Shodan","Shoppimon","ShopWiki","shortURL lengthener","ShortLinkTranslate","shrinktheweb","Sideqik","Siege","SimplePie","SimplyFast","Siphon","SISTRIX","Site-Shot\\\/","Site Sucker","Site24x7","SiteBar","Sitebeam","Sitebulb\\\/","SiteCondor","SiteExplorer","SiteGuardian","Siteimprove","SiteIndexed","Sitemap(s)? Generator","SitemapGenerator","SiteMonitor","Siteshooter B0t","SiteSnagger","SiteSucker","SiteTruth","Sitevigil","sitexy\\.com","SkypeUriPreview","Slack\\\/","slider\\.com","slurp","SlySearch","SmartDownload","SMRF URL Expander","SMUrlExpander","Snake","Snappy","SnapSearch","Snarfer\\\/","SniffRSS","sniptracker","Snoopy","SnowHaze Search","sogou web","SortSite","Sottopop","sovereign\\.ai","SpaceBison","SpamExperts","Spammen","Spanner","spaziodati","SPDYCheck","Specificfeeds","speedy","SPEng","Spinn3r","spray-can","Sprinklr ","spyonweb","sqlmap","Sqlworm","Sqworm","SSL Labs","ssl-tools","StackRambler","Statastico\\\/","StatusCake","Steeler","Stratagems Kumo","Stroke\\.cz","StudioFACA","StumbleUpon","suchen","Sucuri","summify","SuperHTTP","Surphace Scout","Suzuran","Symfony BrowserKit","Symfony2 BrowserKit","SynHttpClient-Built","Sysomos","sysscan","Szukacz","T0PHackTeam","tAkeOut","Tarantula\\\/","Taringa UGC","TarmotGezgin","Teleport","Telesoft","Telesphoreo","Telesphorep","Tenon\\.io","teoma","terrainformatica","Test Certificate Info","testuri","Tetrahedron","TextRazor Downloader","The Drop Reaper","The Expert HTML Source Viewer","The Knowledge AI","The Intraformant","theinternetrules","TheNomad","Thinklab","Thumbshots","ThumbSniper","Thumbor","timewe\\.net","TinEye","Tiny Tiny RSS","TLSProbe\\\/","Toata","topster","touche\\.com","Traackr\\.com","tracemyfile","Trackuity","TrapitAgent","Trendiction","Trendsmap","trendspottr","truwoGPS","TryJsoup","TulipChain","Turingos","Turnitin","tweetedtimes","Tweetminster","Tweezler\\\/","twibble","Twice","Twikle","Twingly","Twisted PageGetter","Typhoeus","ubermetrics-technologies","uclassify","UdmSearch","unchaos","unirest-java","UniversalFeedParser","Unshorten\\.It","Untiny","UnwindFetchor","updated","updown\\.io daemon","Upflow","Uptimia","Urlcheckr","URL Verifier","URLitor","urlresolver","Urlstat","URLTester","UrlTrends Ranking Updater","URLy Warning","URLy\\.Warning","Vacuum","Vagabondo","VB Project","vBSEO","VCI","via ggpht\\.com GoogleImageProxy","Virusdie","visionutils","vkShare","VoidEYE","Voil","voltron","voyager\\\/","VSAgent\\\/","VSB-TUO\\\/","Vulnbusters Meter","VYU2","w3af\\.org","W3C_Unicorn","W3C-checklink","W3C-mobileOK","WAC-OFU","Wallpapers\\\/[0-9]+","WallpapersHD","wangling","Wappalyzer","WatchMouse","WbSrch\\\/","WDT\\.io","web-capture\\.net","Web-sniffer","Web Auto","Web Collage","Web Enhancer","Web Fetch","Web Fuck","Web Pix","Web Sauger","Web spyder","Web Sucker","Webalta","Webauskunft","WebAuto","WebCapture","WebClient\\\/","webcollage","WebCookies","WebCopier","WebCorp","WebDataStats","WebDoc","WebEnhancer","WebFetch","WebFuck","WebGazer","WebGo IS","WebImageCollector","WebImages","WebIndex","webkit2png","WebLeacher","webmastercoffee","webmon ","WebPix","WebReaper","WebSauger","webscreenie","Webshag","Webshot","Website Quester","websitepulse agent","WebsiteQuester","Websnapr","WebSniffer","Webster","WebStripper","WebSucker","Webthumb\\\/","WebThumbnail","WebWhacker","WebZIP","WeLikeLinks","WEPA","WeSEE","wf84","Wfuzz\\\/","wget","WhatCMS","WhatsApp","WhatsMyIP","WhatWeb","WhereGoes\\?","Whibse","WhoAPI\\\/","WhoRunsCoinHive","Whynder Magnet","WinHttp-Autoproxy-Service","Windows-RSS-Platform","WinPodder","wkhtmlto","wmtips","Woko","Wolfram HTTPClient","woorankreview","Word\\\/","WordPress\\\/","worldping-api","WordupinfoSearch","wotbox","WP Engine Install Performance API","wpif","wprecon\\.com survey","WPScan","wscheck","Wtrace","WWW-Collector-E","WWW-Mechanize","WWW::Document","WWW::Mechanize","www\\.monitor\\.us","WWWOFFLE","x09Mozilla","x22Mozilla","XaxisSemanticsClassifier","Xenu Link Sleuth","XING-contenttabreceiver","xpymep([0-9]?)\\.exe","Y!J-(ASR|BSC)","Y\\!J-BRW","Yaanb","yacy","Yahoo Link Preview","YahooCacheSystem","YahooYSMcm","YandeG","Yandex(?!Search)","yanga","yeti","Yo-yo","Yoleo Consumer","yoogliFetchAgent","YottaaMonitor","Your-Website-Sucks","yourls\\.org","YoYs\\.net","YP\\.PL","Zabbix","Zade","Zao","Zauba","Zemanta Aggregator","Zend_Http_Client","Zend\\\\Http\\\\Client","Zermelo","Zeus ","zgrab","ZnajdzFoto","ZnHTTP","Zombie\\.js","Zoom\\.Mac","ZyBorg","[a-z0-9\\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer|scraper)"]
|
@@ -0,0 +1 @@
|
|
1
|
+
["Safari.[\\d\\.]*","Firefox.[\\d\\.]*"," Chrome.[\\d\\.]*","Chromium.[\\d\\.]*","MSIE.[\\d\\.]","Opera\\\/[\\d\\.]*","Mozilla.[\\d\\.]*","AppleWebKit.[\\d\\.]*","Trident.[\\d\\.]*","Windows NT.[\\d\\.]*","Android [\\d\\.]*","Macintosh.","Ubuntu","Linux","[ ]Intel","Mac OS X [\\d_]*","(like )?Gecko(.[\\d\\.]*)?","KHTML,","CriOS.[\\d\\.]*","CPU iPhone OS ([0-9_])* like Mac OS X","CPU OS ([0-9_])* like Mac OS X","iPod","compatible","x86_..","i686","x64","X11","rv:[\\d\\.]*","Version.[\\d\\.]*","WOW64","Win64","Dalvik.[\\d\\.]*"," \\.NET CLR [\\d\\.]*","Presto.[\\d\\.]*","Media Center PC","BlackBerry","Build","Opera Mini\\\/\\d{1,2}\\.\\d{1,2}\\.[\\d\\.]*\\\/\\d{1,2}\\.","Opera"," \\.NET[\\d\\.]*","cubot","; M bot","; CRONO","; B bot","; IDbot","; ID bot","; POWER BOT",";"]
|
@@ -0,0 +1 @@
|
|
1
|
+
["HTTP_USER_AGENT","HTTP_X_OPERAMINI_PHONE_UA","HTTP_X_DEVICE_USER_AGENT","HTTP_X_ORIGINAL_USER_AGENT","HTTP_X_SKYFIRE_PHONE","HTTP_X_BOLT_PHONE_UA","HTTP_DEVICE_STOCK_UA","HTTP_X_UCBROWSER_DEVICE_UA","HTTP_FROM","HTTP_X_SCANNER"]
|
data/lib/rack/crawler_detect.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Rack
|
4
|
+
# Rack-based interface to detect crawlers
|
5
|
+
#
|
6
|
+
# @since 0.1.0
|
4
7
|
class CrawlerDetect
|
5
8
|
def initialize(app, options = {})
|
6
9
|
Rack::Request::Helpers.module_eval do
|
@@ -16,30 +19,30 @@ module Rack
|
|
16
19
|
end
|
17
20
|
|
18
21
|
def call(env)
|
19
|
-
|
20
|
-
|
21
|
-
@app.call(@env)
|
22
|
+
set_env_variables!(env)
|
23
|
+
@app.call(env)
|
22
24
|
end
|
23
25
|
|
24
26
|
private
|
25
27
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
28
|
+
def set_env_variables!(env)
|
29
|
+
ua = user_agent(env)
|
30
|
+
return unless ua
|
31
|
+
detector = ::CrawlerDetect::Detector.new(ua)
|
32
|
+
env["rack.crawler_detect"] = {
|
33
|
+
is_crawler: detector.is_crawler?,
|
34
|
+
crawler_name: detector.crawler_name
|
35
|
+
}
|
36
|
+
end
|
34
37
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
38
|
+
def user_agent(env)
|
39
|
+
user_agent_headers.map do |header|
|
40
|
+
env[header]
|
41
|
+
end.compact.join(" ")
|
42
|
+
end
|
40
43
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
+
def user_agent_headers
|
45
|
+
::CrawlerDetect::Library.get_array("headers")
|
46
|
+
end
|
44
47
|
end
|
45
48
|
end
|
metadata
CHANGED
@@ -1,29 +1,57 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crawler_detect
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pavel Kozlov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-06-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: oj
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: qonfig
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.24'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0.24'
|
13
41
|
- !ruby/object:Gem::Dependency
|
14
42
|
name: activesupport
|
15
43
|
requirement: !ruby/object:Gem::Requirement
|
16
44
|
requirements:
|
17
45
|
- - "~>"
|
18
46
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
47
|
+
version: 6.0.3
|
20
48
|
type: :development
|
21
49
|
prerelease: false
|
22
50
|
version_requirements: !ruby/object:Gem::Requirement
|
23
51
|
requirements:
|
24
52
|
- - "~>"
|
25
53
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
54
|
+
version: 6.0.3
|
27
55
|
- !ruby/object:Gem::Dependency
|
28
56
|
name: bundler
|
29
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +150,20 @@ dependencies:
|
|
122
150
|
- - "~>"
|
123
151
|
- !ruby/object:Gem::Version
|
124
152
|
version: '3.0'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: armitage-rubocop
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - '='
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0.82'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - '='
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '0.82'
|
125
167
|
description: CrawlerDetect is a library to detect bots/crawlers via the user agent
|
126
168
|
email:
|
127
169
|
- loadkpi@gmail.com
|
@@ -133,17 +175,25 @@ files:
|
|
133
175
|
- ".rspec"
|
134
176
|
- ".rubocop.yml"
|
135
177
|
- ".travis.yml"
|
178
|
+
- CHANGELOG.md
|
136
179
|
- Gemfile
|
180
|
+
- Gemfile.lock
|
137
181
|
- LICENSE.txt
|
138
182
|
- README.md
|
139
183
|
- Rakefile
|
184
|
+
- bin/update_raw_files
|
140
185
|
- crawler_detect.gemspec
|
141
186
|
- lib/crawler_detect.rb
|
187
|
+
- lib/crawler_detect/config.rb
|
142
188
|
- lib/crawler_detect/detector.rb
|
143
189
|
- lib/crawler_detect/library.rb
|
144
190
|
- lib/crawler_detect/library/crawlers.rb
|
145
191
|
- lib/crawler_detect/library/exclusions.rb
|
146
192
|
- lib/crawler_detect/library/headers.rb
|
193
|
+
- lib/crawler_detect/library/loader.rb
|
194
|
+
- lib/crawler_detect/library/raw/Crawlers.json
|
195
|
+
- lib/crawler_detect/library/raw/Exclusions.json
|
196
|
+
- lib/crawler_detect/library/raw/Headers.json
|
147
197
|
- lib/crawler_detect/version.rb
|
148
198
|
- lib/rack/crawler_detect.rb
|
149
199
|
homepage: https://github.com/loadkpi/crawler_detect
|
@@ -165,8 +215,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
165
215
|
- !ruby/object:Gem::Version
|
166
216
|
version: '0'
|
167
217
|
requirements: []
|
168
|
-
|
169
|
-
rubygems_version: 2.7.6
|
218
|
+
rubygems_version: 3.1.2
|
170
219
|
signing_key:
|
171
220
|
specification_version: 4
|
172
221
|
summary: 'CrawlerDetect: detect bots/crawlers'
|