phishtank_scraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0688aee54e521f9a6a96fd3e48b7f7b53dce9c7d
4
+ data.tar.gz: 9c2918a1aee0a431e6d7d941fe0bce6ccc2048f9
5
+ SHA512:
6
+ metadata.gz: 129c7e9814004ed43b57ec64dfb6481dfa5081414903e3942e3639fa26b7ae885abd1fd754313db27184716a3e48da89e4619fc716b37a4478a5327c7a551666
7
+ data.tar.gz: d50292dcb0d1b6e657d1b69743af3c2335631521de0403af93c48825cb60a1fae6205d45a7c766c0d48452ba8e2f5034956583e7bd3a4ad15b0e270e01dc5b97
@@ -0,0 +1,72 @@
1
+ require 'nokogiri'
2
+ # require 'random_user_agent'
3
+ require 'open-uri'
4
+
5
# One scraped PhishTank listing page, exposed as an Enumerable collection of
# submission hashes (newest first, as listed by the site).
#
# Example entry:
#   {
#     id: "4141251",
#     url: "http://bintango.xyz/AIsaE",
#     created_at: "added on Jun 3rd 2016 3:57 PM",
#     submitter: "PhishReporter",
#     valid: "Unknown",
#     online: "ONLINE"
#   }
class PhishingSet
  include Enumerable

  # Number of submissions assumed per listing page when estimating page indexes.
  PAGE_SIZE = 20

  attr_reader :url, :all

  # Downloads the listing page at +url+ and parses its rows into +all+.
  #
  # url - String or URI of a listing page.
  def initialize(url)
    @url = URI(url)

    @page = fetch_document(@url.to_s)
    @all = scrape_parse
  end

  # Parses the rows of the '.data' table of the loaded page.
  #
  # Returns an Array of Hashes (see the class comment); empty when the page
  # has no '.data' table.
  def scrape_parse
    table = @page.at('.data')
    return [] unless table

    # The first row is the table header.
    table.search('tr').drop(1).map do |row|
      detail_href = cell_text(row, 'td[1]/a/@href')
      url = cell_text(row, 'td[2]/text()')

      # Long URLs are truncated with "..." in the listing; the complete value
      # is read from the detail page. Keep the truncated value if that fails
      # to yield anything.
      if url.end_with?('...') && !detail_href.empty?
        full_url = scrape_detail(detail_href)
        url = full_url unless full_url.empty?
      end

      {
        id: cell_text(row, 'td[1]/a/text()'),
        url: url,
        created_at: cell_text(row, 'td[2]/span/text()'),
        submitter: cell_text(row, 'td[3]/a/text()'),
        valid: cell_text(row, 'td[4]/text()'),
        online: cell_text(row, 'td[5]/strong/text()')
      }
    end
  end

  # Fetches a submission detail page and returns the complete URL shown on it.
  #
  # url - absolute URL, or a reference relative to the listing page URL.
  #
  # Returns a String; empty when the expected markup is not present.
  def scrape_detail(url)
    # URI.join keeps the port and handles root-relative hrefs correctly.
    target = URI.join(@url.to_s, url.to_s)

    detail_page = fetch_document(target.to_s)
    container = detail_page.at('#widecol')
    node = container && container.at_xpath('div/div[3]/b/text()')
    node.to_s
  end

  # Returns the first submission, or the first +n+ submissions when an
  # argument is given (same contract as Enumerable#first).
  def first(*args)
    @all.first(*args)
  end

  # Estimates the zero-based index of the listing page that contains the
  # submission +id+, assuming ids are listed in descending order with
  # PAGE_SIZE entries per page. When ids are not contiguous the estimate is an
  # upper bound.
  #
  # Returns 0 for an empty set or for ids newer than the newest listed one.
  def page_at_id(id)
    newest = first
    return 0 unless newest

    offset = newest[:id].to_i - id.to_i
    [offset / PAGE_SIZE, 0].max
  end

  # Yields every submission hash; returns an Enumerator when no block is given.
  def each(&block)
    return enum_for(:each) unless block_given?

    @all.each(&block)
  end

  private

  # Stripped string value of the first node matching +xpath+ under +row+
  # ("" when nothing matches).
  def cell_text(row, xpath)
    row.at_xpath(xpath).to_s.strip
  end

  # Downloads +location+ and parses it as HTML, closing the stream afterwards.
  # URI.open is used when open-uri provides it; older versions only patch
  # Kernel#open, and that patch is gone in Ruby 3.0+.
  def fetch_document(location)
    opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
    opener.call(location) { |io| Nokogiri::HTML(io) }
  end
end
@@ -0,0 +1,32 @@
1
# Provides routes and URLs for a PhishTank-style site.
class Site
  attr_reader :domain

  # url - base URL of the site, without a trailing slash
  #       (e.g. "http://phishtank.com").
  def initialize(url)
    @domain = url
  end

  # URL of the first page of the unfiltered archive.
  def home
    build_path
  end

  # Builds the URL of a listing page.
  #
  # page_index - zero-based page number.
  # options    - Hash with optional :active and :valid filter values. When at
  #              least one of them is given the search endpoint is used and
  #              the missing one defaults to "y"; otherwise the plain archive
  #              endpoint is used.
  #
  # Returns the URL as a String.
  def build_path(page_index = 0, options = {})
    active = options[:active]
    valid = options[:valid]

    path = if active || valid
             "phish_search.php?page=#{page_index}" \
               "&active=#{active || 'y'}&valid=#{valid || 'y'}&Search=Search"
           else
             "phish_archive.php?page=#{page_index}"
           end

    "#{@domain}/#{path}"
  end

  # URL of the detail page of the submission with the given id.
  def build_detail_path(submission_id)
    "#{@domain}/phish_detail.php?phish_id=#{submission_id}"
  end
end
@@ -0,0 +1,42 @@
1
+ require 'phishtank_scraper/phishing_set'
2
+ require 'phishtank_scraper/site'
3
+
4
# Entry point that drives the scraping of listing pages.
class PhishtankScraper
  attr_reader :site, :range

  def initialize(url="http://phishtank.com")
    @range = (0..0)
    @site = Site.new(url)
  end

  # Scrapes every page in +range+ (a Range or a single page index) and
  # returns the detections as one flat array. The range used is remembered
  # in +range+.
  # options:
  #   active: "All", "n", "y", "u"
  #   valid: "All", "n", "y", "u"
  def page_scrape(range=@range, options={})
    pages = build_range(range)
    collect_pages(pages, options)
  end

  # Returns the detections whose id is at least +since+, newest first,
  # walking from the first page up to the page estimated to hold +since+.
  # options:
  #   active: "All", "n", "y", "u"
  #   valid: "All", "n", "y", "u"
  def id_scrape(since, options={})
    since = since.to_i
    front_page = PhishingSet.new(@site.home)
    last_page = front_page.page_at_id(since)

    detections = collect_pages((0..last_page), options)
    detections.reject { |detection| detection[:id].to_i < since }
  end

  private

  # Stores +value+ as the current range, wrapping a single index into a
  # one-element range.
  def build_range(value)
    @range = value.instance_of?(Range) ? value : (value..value)
  end

  # Fetches each page index in +pages+ and concatenates the parsed entries.
  def collect_pages(pages, options)
    pages.flat_map do |page_index|
      page_url = @site.build_path(page_index, options)
      PhishingSet.new(page_url).all
    end
  end
end
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: phishtank_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Marlon Méndez
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-06-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.6.7.2
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.6'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.6.7.2
33
+ description: Scrapes Phishtank submissions given a page interval or an id
34
+ email: marlonmendezg@gmail.com
35
+ executables: []
36
+ extensions: []
37
+ extra_rdoc_files: []
38
+ files:
39
+ - lib/phishtank_scraper.rb
40
+ - lib/phishtank_scraper/phishing_set.rb
41
+ - lib/phishtank_scraper/site.rb
42
+ homepage: http://rubygems.org/gems/phishtank_scraper
43
+ licenses:
44
+ - MIT
45
+ metadata: {}
46
+ post_install_message:
47
+ rdoc_options: []
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: '0'
60
+ requirements: []
61
+ rubyforge_project:
62
+ rubygems_version: 2.4.5
63
+ signing_key:
64
+ specification_version: 4
65
+ summary: Scrapes Phishtank submissions
66
+ test_files: []