domains_scanner 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ea761f85e758619f9d262dc93c6d28ed3b528788
4
- data.tar.gz: 9cf162281540446d281a98ec9f7b596215fdba47
3
+ metadata.gz: 3feae4927a04afcb710018b1823d23b7f0144137
4
+ data.tar.gz: c4299771c9de14510a584c2f3007fa4497249254
5
5
  SHA512:
6
- metadata.gz: 9a39d231cd4afaf155659605ee0f04faa920f52d2b1b4c0af5fb545b0e5fcb634b58911a0a01986d5ebbf6430a261c350b5921c798fe4789566de7c07ebe9729
7
- data.tar.gz: aa736332989a71b16ed09465375d29906eee1ddd1f1219b1582e3cd3a1e2e26e39c2ab44de2b3d58cecee064e1447e10791cb121b02a83cc3f933d95c9ebbc13
6
+ metadata.gz: cc7889eed1197a502b31dcd5d58041eac9d746fa88f584bb543ceb9f124d919eaa1440efb2346e1b6271fdf34f280154f2a4729a765736f5b0c5fe380f1df15c
7
+ data.tar.gz: 9f53a04ffc8ad5d2caba6216d789e5f9c0757c32ecfd612283c545585499598c27c576645ec8c4a71179a06795242ec57a5f415e4efa70ab066ef25dac02f77c
data/.gitignore CHANGED
@@ -3,7 +3,6 @@
3
3
  /Gemfile.lock
4
4
  /_yardoc/
5
5
  /coverage/
6
- /doc/
7
6
  /pkg/
8
7
  /spec/reports/
9
8
  /tmp/
data/README.md CHANGED
@@ -1,8 +1,8 @@
1
1
  # DomainsScanner
2
2
 
3
- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/domains_scanner`. To experiment with that code, run `bin/console` for an interactive prompt.
3
+ Search for possible subdomains of a specified domain. By default, DomainsScanner uses the `site:*.{domain}.{top_level_domain}` search syntax to find matching sites indexed by Baidu and Google.
4
4
 
5
- TODO: Delete this and the text above, and describe your gem
5
+ ![Preview Screenshot](./doc/images/preview.png)
6
6
 
7
7
  ## Installation
8
8
 
@@ -22,11 +22,22 @@ Or install it yourself as:
22
22
 
23
23
  ## Usage
24
24
 
25
- TODO: Write usage instructions here
25
+ ```sh
26
+ $ domains_scanner -h
27
+ Usage: domains_scanner [options] domain_name
26
28
 
27
- ## Development
29
+ Specific options:
30
+ -v, --[no-]verbose Run verbosely, default: false
31
+ --top-domains= search top level domains, split by comma, default: ["com", "cn", "com.cn", "net", "org", "ltd", "cc", "mobi", "live", "io", "co", "me", "hk"]
32
+ -e, --engines= search engines, split by comma, default: [google, baidu]
33
+ --max-page= Maximum number of pages to scan, default: 20
34
+
35
+ Common options:
36
+ -h, --help Show this message
37
+ --version Show version
38
+ ```
28
39
 
29
- After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
40
+ ## Development
30
41
 
31
42
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
43
 
Binary file
@@ -1,16 +1,12 @@
1
1
  module DomainsScanner
2
2
  module Crawlers
3
3
  class Baidu < Base
4
- def search(domain_name, top_level_domain, page = 1)
5
- set_user_agent
6
- query = search_keyword(domain_name, top_level_domain)
7
- start = (page - 1) * 10
8
- doc = agent.get("https://www.baidu.com/s?wd=#{query}&pn=#{start}")
9
-
10
- results = parse_results(doc)
11
- have_next_page = have_next_page?(doc)
4
+ def host
5
+ "https://www.baidu.com"
6
+ end
12
7
 
13
- DomainsScanner::Results.new(results, have_next_page)
8
+ def keyword_field_name
9
+ "wd"
14
10
  end
15
11
 
16
12
  # [{title: "xxx", url: "xxx"}, ...]
@@ -33,8 +29,8 @@ module DomainsScanner
33
29
  end
34
30
  end
35
31
 
36
- def have_next_page?(doc)
37
- doc.search("#page strong+a").any?
32
+ def next_page_link_selector
33
+ "#page strong+a"
38
34
  end
39
35
  end
40
36
  end
@@ -7,23 +7,55 @@ module DomainsScanner
7
7
  end
8
8
 
9
9
  def agent
10
- @agent ||= Mechanize.new
10
+ @agent ||= Mechanize.new do |agent|
11
+ agent.user_agent_alias = "Mac Safari"
12
+ end
11
13
  end
12
14
 
13
- def set_user_agent
14
- agent.user_agent_alias = available_agent_alias.sample
15
+ def search_by_form(domain_name, top_level_domain)
16
+ doc = agent.get(host)
17
+
18
+ form = doc.forms.first
19
+ query = search_keyword(domain_name, top_level_domain)
20
+ form[keyword_field_name] = query
21
+ doc = form.submit
22
+
23
+ results = parse_results(doc)
24
+ next_page_link = parse_next_page_link(doc)
25
+
26
+ DomainsScanner::Results.new(results, next_page_link)
15
27
  end
16
28
 
17
- def available_agent_alias
18
- @available_agent_alias ||= Mechanize::AGENT_ALIASES.keys - ['Mechanize']
29
+ def search_by_link(link)
30
+ doc = agent.get(link)
31
+ results = parse_results(doc)
32
+ next_page_link = parse_next_page_link(doc)
33
+
34
+ DomainsScanner::Results.new(results, next_page_link)
35
+ end
36
+
37
+ def parse_next_page_link(doc)
38
+ next_page_tag = doc.search(next_page_link_selector).first
39
+ return unless next_page_tag
40
+
41
+ href = next_page_tag.attributes["href"]
42
+ "#{host}#{href}"
19
43
  end
20
44
 
21
45
  def search_keyword(domain_name, top_level_domain)
22
46
  "site:*.#{domain_name}.#{top_level_domain}"
23
47
  end
24
48
 
25
- def search(domain_name, top_level_domain, page = 1)
26
- raise NotImplementedError, "#{self.class.name}#search need to be implmented in sub class"
49
+ def keyword_field_name
50
+ raise NotImplementedError
51
+ end
52
+
53
+ def parse_results(doc)
54
+ raise NotImplementedError
55
+ end
56
+
57
+ def have_next_page?(doc)
58
+ raise NotImplementedError
27
59
  end
28
60
  end
29
61
  end
@@ -1,16 +1,12 @@
1
1
  module DomainsScanner
2
2
  module Crawlers
3
3
  class Google < Base
4
- def search(domain_name, top_level_domain, page = 1)
5
- set_user_agent
6
- query = search_keyword(domain_name, top_level_domain)
7
- start = (page - 1) * 10
8
- doc = agent.get("https://google.com/search?q=#{query}&start=#{start}")
9
-
10
- results = parse_results(doc)
11
- have_next_page = have_next_page?(doc)
4
+ def host
5
+ "https://google.com"
6
+ end
12
7
 
13
- DomainsScanner::Results.new(results, have_next_page)
8
+ def keyword_field_name
9
+ "q"
14
10
  end
15
11
 
16
12
  # [{title: "xxx", url: "xxx"}, ...]
@@ -28,8 +24,8 @@ module DomainsScanner
28
24
  end
29
25
  end
30
26
 
31
- def have_next_page?(doc)
32
- doc.search("div#foot .cur+td").any?
27
+ def next_page_link_selector
28
+ "div#foot .cur+td>a"
33
29
  end
34
30
  end
35
31
  end
@@ -2,13 +2,13 @@ require 'domains_scanner/result_item'
2
2
 
3
3
  module DomainsScanner
4
4
  class Results
5
- attr_reader :items, :have_next_page
5
+ attr_reader :items, :next_page_link
6
6
 
7
- def initialize(results, have_next_page)
7
+ def initialize(results, next_page_link)
8
8
  @items = results.map do |result|
9
9
  ResultItem.new(title: result[:title], url: result[:url])
10
10
  end
11
- @have_next_page = have_next_page
11
+ @next_page_link = next_page_link
12
12
  end
13
13
  end
14
14
  end
@@ -13,24 +13,32 @@ module DomainsScanner
13
13
  @workers = DomainsScanner.engines.map do |engine|
14
14
  crawler = DomainsScanner::Crawlers.build(engine)
15
15
  page = 1
16
+ next_page_link = nil
16
17
 
17
18
  Thread.new do
18
19
  loop do
19
20
  puts "Scanning #{domain} with #{engine} on page: #{page}" if DomainsScanner.verbose
20
21
 
21
22
  begin
22
- results = crawler.search(@domain_word, @top_level_domain, page)
23
+ if page == 1
24
+ puts "Search by form>>>>" if DomainsScanner.verbose
25
+ results = crawler.search_by_form(@domain_word, @top_level_domain)
26
+ else
27
+ puts "Search by link: #{next_page_link}>>>>" if DomainsScanner.verbose
28
+ results = crawler.search_by_link(next_page_link)
29
+ end
30
+ next_page_link = results.next_page_link
31
+
23
32
  results.items.each do |item|
24
- DomainsScanner.output_queue.push({
25
- domain: item.host, top_level_domain: @top_level_domain, engine: engine
26
- })
33
+ DomainsScanner.output_queue.push({
34
+ domain: item.host, top_level_domain: @top_level_domain, engine: engine
35
+ })
27
36
  end
28
- break unless results.have_next_page
29
37
  rescue Mechanize::ResponseCodeError => e
30
38
  puts "search in #{engine} error, skip now" if DomainsScanner.verbose
31
39
  end
32
40
 
33
- break unless page < DomainsScanner.max_page
41
+ break unless next_page_link && page < DomainsScanner.max_page
34
42
  page += 1
35
43
  end
36
44
  end
@@ -1,3 +1,3 @@
1
1
  module DomainsScanner
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: domains_scanner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Hong
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-11-25 00:00:00.000000000 Z
11
+ date: 2017-12-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -86,6 +86,7 @@ files:
86
86
  - README.md
87
87
  - Rakefile
88
88
  - bin/domains_scanner
89
+ - doc/images/preview.png
89
90
  - domains_scanner.gemspec
90
91
  - lib/ansi_colors.rb
91
92
  - lib/domains_scanner.rb