phishtank_scraper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/phishtank_scraper/phishing_set.rb +72 -0
- data/lib/phishtank_scraper/site.rb +32 -0
- data/lib/phishtank_scraper.rb +42 -0
- metadata +66 -0
    
        checksums.yaml
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            SHA1:
         | 
| 3 | 
            +
              metadata.gz: 0688aee54e521f9a6a96fd3e48b7f7b53dce9c7d
         | 
| 4 | 
            +
              data.tar.gz: 9c2918a1aee0a431e6d7d941fe0bce6ccc2048f9
         | 
| 5 | 
            +
            SHA512:
         | 
| 6 | 
            +
              metadata.gz: 129c7e9814004ed43b57ec64dfb6481dfa5081414903e3942e3639fa26b7ae885abd1fd754313db27184716a3e48da89e4619fc716b37a4478a5327c7a551666
         | 
| 7 | 
            +
              data.tar.gz: d50292dcb0d1b6e657d1b69743af3c2335631521de0403af93c48825cb60a1fae6205d45a7c766c0d48452ba8e2f5034956583e7bd3a4ad15b0e270e01dc5b97
         | 
| @@ -0,0 +1,72 @@ | |
| 1 | 
            +
            require 'nokogiri'
         | 
| 2 | 
            +
            # require 'random_user_agent'
         | 
| 3 | 
            +
            require 'open-uri'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            # Scrapes one PhishTank listing page into a collection of phishing hashes.
            #
            # Every element of #all is a Hash with String values. Example:
            # {
            #   id: "4141251",
            #   url: "http://bintango.xyz/AIsaE",
            #   created_at: "added on Jun 3rd 2016 3:57 PM",
            #   submitter: "PhishReporter",
            #   valid: "Unknown",
            #   online: "ONLINE"
            # }
            class PhishingSet
              include Enumerable

              # Divisor used by #page_at_id: the listing shows 20 items per page.
              ITEMS_PER_PAGE = 20

              attr_reader :url, :all

              # Downloads the listing page at +url+ and parses it immediately.
              #
              # url - String (or URI) of a listing page.
              def initialize(url)
                @url = URI(url)
                @page = Nokogiri::HTML(open_page(@url.to_s))
                @all = scrape_parse
              end

              # Converts every row of the '.data' table (header row skipped) into a
              # Hash with the keys :id, :url, :created_at, :submitter, :valid, :online.
              #
              # Returns an Array of Hashes.
              def scrape_parse
                rows = @page.at('.data').search('tr')

                rows.drop(1).map do |row| # first row is the table header
                  detail_href = row.at_xpath('td[1]/a/@href').to_s.strip

                  listed_url = row.at_xpath('td[2]/text()').to_s.strip
                  # A trailing "..." marks an incomplete url in the listing; the
                  # full value is read from the detail page instead.
                  listed_url = scrape_detail(detail_href) if listed_url.end_with?('...')

                  {
                    id: row.at_xpath('td[1]/a/text()').to_s.strip,
                    url: listed_url,
                    created_at: row.at_xpath('td[2]/span/text()').to_s.strip,
                    submitter: row.at_xpath('td[3]/a/text()').to_s.strip,
                    valid: row.at_xpath('td[4]/text()').to_s.strip,
                    online: row.at_xpath('td[5]/strong/text()').to_s.strip
                  }
                end
              end

              # Fetches a detail page and returns the text found at
              # div/div[3]/b inside the '#widecol' element.
              #
              # url - absolute URL, or a reference relative to the listing page.
              #
              # Returns a String.
              def scrape_detail(url)
                detail_page = Nokogiri::HTML(open_page(detail_uri(url)))
                detail_page.at("#widecol").at_xpath("div/div[3]/b/text()").to_s
              end

              # Estimates the index of the listing page that contains submission
              # +id+, based on the id of the first entry of this set.
              #
              # id - Integer submission id.
              #
              # Returns an Integer (integer division, so the quotient is floored).
              def page_at_id(id)
                last_subm_id = first[:id].to_i
                (last_subm_id - id + 1) / ITEMS_PER_PAGE
              end

              # Yields every phishing hash in page order.
              # Returns an Enumerator when no block is given.
              def each(&block)
                return enum_for(:each) unless block_given?

                @all.each(&block)
              end

              private

              # Opens +location+ for reading. URI.open is preferred because
              # open-uri no longer patches Kernel#open on Ruby 3.0+; older Rubies
              # fall back to the patched Kernel#open.
              def open_page(location)
                target = location.to_s
                URI.respond_to?(:open) ? URI.open(target) : open(target)
              end

              # Resolves +href+ against the listing page URL, so relative and
              # root-relative links keep the scheme, host and port of @url while
              # absolute links are returned unchanged.
              def detail_uri(href)
                @url.merge(href.to_s).to_s
              end
            end
         | 
| @@ -0,0 +1,32 @@ | |
| 1 | 
            +
            # Provides routes and URLs
            class Site
              attr_reader :domain

              # url - base address of the site, e.g. "http://phishtank.com".
              def initialize(url)
                @domain = url
              end

              # Returns the URL of the first archive page (page 0, no filters).
              def home
                build_path
              end

              # Builds the URL of a listing page.
              #
              # page_index - page number placed in the "page" query parameter.
              # options    - Hash; the recognised keys are :active and :valid.
              #
              # When either option is present the search page is used and the
              # missing one defaults to "y"; with no options the plain archive
              # page is used.
              #
              # Returns a String.
              def build_path(page_index=0, options={})
                active = options[:active]
                valid = options[:valid]

                path = if active || valid
                  filters = "&active=#{active || 'y'}&valid=#{valid || 'y'}"
                  "phish_search.php?page=#{page_index}#{filters}&Search=Search"
                else
                  "phish_archive.php?page=#{page_index}"
                end

                join(path)
              end

              # Returns the URL of the detail page for +submission_id+.
              def build_detail_path(submission_id)
                join("phish_detail.php?phish_id=#{submission_id}")
              end

              private

              # Appends +path+ to the domain, tolerating one trailing slash on
              # the domain so the result never contains "//" before the path.
              def join(path)
                "#{@domain.to_s.chomp('/')}/#{path}"
              end
            end
         | 
| @@ -0,0 +1,42 @@ | |
| 1 | 
            +
            require 'phishtank_scraper/phishing_set'
         | 
| 2 | 
            +
            require 'phishtank_scraper/site'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # Director interface for scraping
            class PhishtankScraper
              attr_reader :site, :range

              # url - base address handed to Site; defaults to the public site.
              def initialize(url="http://phishtank.com")
                @site = Site.new(url)
                @range = (0..0)
              end

              # Returns an array of detections in the pages range.
              #
              # range   - a Range of page indexes, or a single page index. The
              #           value is remembered and exposed through #range.
              # options - passed through to Site#build_path:
              #           active: "All", "n", "y", "u"
              #           valid: "All", "n", "y", "u"
              def page_scrape(range=@range, options={})
                build_range(range).flat_map do |page_index|
                  PhishingSet.new(@site.build_path(page_index, options)).all
                end
              end

              # Returns an array of detections from id to last submitted id.
              #
              # since   - lowest submission id to keep (converted with to_i).
              # options - passed through to Site#build_path:
              #           active: "All", "n", "y", "u"
              #           valid: "All", "n", "y", "u"
              def id_scrape(since, options={})
                since = since.to_i
                # The home page is used to work out how many pages to fetch.
                page_at = PhishingSet.new(@site.home).page_at_id(since)

                detections = (0..page_at).flat_map do |page_index|
                  PhishingSet.new(@site.build_path(page_index, options)).all
                end

                # The last fetched page may contain entries older than +since+.
                detections.reject { |ph| ph[:id].to_i < since }
              end

              private

              # Stores +value+ as the current page range; a single index becomes
              # a one-page range. Range subclasses are accepted as ranges.
              def build_range(value)
                @range = value.is_a?(Range) ? value : (value..value)
              end
            end
         | 
    
        metadata
    ADDED
    
    | @@ -0,0 +1,66 @@ | |
| 1 | 
            +
            --- !ruby/object:Gem::Specification
         | 
| 2 | 
            +
            name: phishtank_scraper
         | 
| 3 | 
            +
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            +
              version: 0.0.1
         | 
| 5 | 
            +
            platform: ruby
         | 
| 6 | 
            +
            authors:
         | 
| 7 | 
            +
            - Marlon Méndez
         | 
| 8 | 
            +
            autorequire: 
         | 
| 9 | 
            +
            bindir: bin
         | 
| 10 | 
            +
            cert_chain: []
         | 
| 11 | 
            +
            date: 2016-06-03 00:00:00.000000000 Z
         | 
| 12 | 
            +
            dependencies:
         | 
| 13 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 14 | 
            +
              name: nokogiri
         | 
| 15 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 16 | 
            +
                requirements:
         | 
| 17 | 
            +
                - - "~>"
         | 
| 18 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 19 | 
            +
                    version: '1.6'
         | 
| 20 | 
            +
                - - ">="
         | 
| 21 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 22 | 
            +
                    version: 1.6.7.2
         | 
| 23 | 
            +
              type: :runtime
         | 
| 24 | 
            +
              prerelease: false
         | 
| 25 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 26 | 
            +
                requirements:
         | 
| 27 | 
            +
                - - "~>"
         | 
| 28 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 29 | 
            +
                    version: '1.6'
         | 
| 30 | 
            +
                - - ">="
         | 
| 31 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 32 | 
            +
                    version: 1.6.7.2
         | 
| 33 | 
            +
            description: Scrapes Phishtank submissions given a page interval or an id
         | 
| 34 | 
            +
            email: marlonmendezg@gmail.com
         | 
| 35 | 
            +
            executables: []
         | 
| 36 | 
            +
            extensions: []
         | 
| 37 | 
            +
            extra_rdoc_files: []
         | 
| 38 | 
            +
            files:
         | 
| 39 | 
            +
            - lib/phishtank_scraper.rb
         | 
| 40 | 
            +
            - lib/phishtank_scraper/phishing_set.rb
         | 
| 41 | 
            +
            - lib/phishtank_scraper/site.rb
         | 
| 42 | 
            +
            homepage: http://rubygems.org/gems/phishtank_scraper
         | 
| 43 | 
            +
            licenses:
         | 
| 44 | 
            +
            - MIT
         | 
| 45 | 
            +
            metadata: {}
         | 
| 46 | 
            +
            post_install_message: 
         | 
| 47 | 
            +
            rdoc_options: []
         | 
| 48 | 
            +
            require_paths:
         | 
| 49 | 
            +
            - lib
         | 
| 50 | 
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 51 | 
            +
              requirements:
         | 
| 52 | 
            +
              - - ">="
         | 
| 53 | 
            +
                - !ruby/object:Gem::Version
         | 
| 54 | 
            +
                  version: '0'
         | 
| 55 | 
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 56 | 
            +
              requirements:
         | 
| 57 | 
            +
              - - ">="
         | 
| 58 | 
            +
                - !ruby/object:Gem::Version
         | 
| 59 | 
            +
                  version: '0'
         | 
| 60 | 
            +
            requirements: []
         | 
| 61 | 
            +
            rubyforge_project: 
         | 
| 62 | 
            +
            rubygems_version: 2.4.5
         | 
| 63 | 
            +
            signing_key: 
         | 
| 64 | 
            +
            specification_version: 4
         | 
| 65 | 
            +
            summary: Scrapes Phishtank submissions
         | 
| 66 | 
            +
            test_files: []
         |