domains_scanner 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -1
- data/README.md +16 -5
- data/doc/images/preview.png +0 -0
- data/lib/domains_scanner/crawlers/baidu.rb +7 -11
- data/lib/domains_scanner/crawlers/base.rb +39 -7
- data/lib/domains_scanner/crawlers/google.rb +7 -11
- data/lib/domains_scanner/results.rb +3 -3
- data/lib/domains_scanner/runner.rb +14 -6
- data/lib/domains_scanner/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3feae4927a04afcb710018b1823d23b7f0144137
|
4
|
+
data.tar.gz: c4299771c9de14510a584c2f3007fa4497249254
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cc7889eed1197a502b31dcd5d58041eac9d746fa88f584bb543ceb9f124d919eaa1440efb2346e1b6271fdf34f280154f2a4729a765736f5b0c5fe380f1df15c
|
7
|
+
data.tar.gz: 9f53a04ffc8ad5d2caba6216d789e5f9c0757c32ecfd612283c545585499598c27c576645ec8c4a71179a06795242ec57a5f415e4efa70ab066ef25dac02f77c
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# DomainsScanner
|
2
2
|
|
3
|
-
|
3
|
+
Search for possible subdomains of a specified domain. By default, DomainsScanner uses the `site:*.{domain}.{top_level_domain}` search syntax to find sites indexed by the Baidu and Google search engines.
|
4
4
|
|
5
|
-
|
5
|
+

|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -22,11 +22,22 @@ Or install it yourself as:
|
|
22
22
|
|
23
23
|
## Usage
|
24
24
|
|
25
|
-
|
25
|
+
```sh
|
26
|
+
$ domains_scanner -h
|
27
|
+
Usage: domains_scanner [options] domain_name
|
26
28
|
|
27
|
-
|
29
|
+
Specific options:
|
30
|
+
-v, --[no-]verbose Run verbosely, default: false
|
31
|
+
--top-domains= search top level domains, split by comma, default: ["com", "cn", "com.cn", "net", "org", "ltd", "cc", "mobi", "live", "io", "co", "me", "hk"]
|
32
|
+
-e, --engines= search engines, split by comma, default: [google, baidu]
|
33
|
+
--max-page= Maximum number of pages to scan, default: 20
|
34
|
+
|
35
|
+
Common options:
|
36
|
+
-h, --help Show this message
|
37
|
+
--version Show version
|
38
|
+
```
|
28
39
|
|
29
|
-
|
40
|
+
## Development
|
30
41
|
|
31
42
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
32
43
|
|
Binary file
|
@@ -1,16 +1,12 @@
|
|
1
1
|
module DomainsScanner
|
2
2
|
module Crawlers
|
3
3
|
class Baidu < Base
|
4
|
-
def
|
5
|
-
|
6
|
-
|
7
|
-
start = (page - 1) * 10
|
8
|
-
doc = agent.get("https://www.baidu.com/s?wd=#{query}&pn=#{start}")
|
9
|
-
|
10
|
-
results = parse_results(doc)
|
11
|
-
have_next_page = have_next_page?(doc)
|
4
|
+
def host
|
5
|
+
"https://www.baidu.com"
|
6
|
+
end
|
12
7
|
|
13
|
-
|
8
|
+
def keyword_field_name
|
9
|
+
"wd"
|
14
10
|
end
|
15
11
|
|
16
12
|
# [{title: "xxx", url: "xxx"}, ...]
|
@@ -33,8 +29,8 @@ module DomainsScanner
|
|
33
29
|
end
|
34
30
|
end
|
35
31
|
|
36
|
-
def
|
37
|
-
|
32
|
+
def next_page_link_selector
|
33
|
+
"#page strong+a"
|
38
34
|
end
|
39
35
|
end
|
40
36
|
end
|
@@ -7,23 +7,55 @@ module DomainsScanner
|
|
7
7
|
end
|
8
8
|
|
9
9
|
def agent
|
10
|
-
@agent ||= Mechanize.new
|
10
|
+
@agent ||= Mechanize.new do |agent|
|
11
|
+
agent.user_agent_alias = "Mac Safari"
|
12
|
+
end
|
11
13
|
end
|
12
14
|
|
13
|
-
def
|
14
|
-
|
15
|
+
def search_by_form(domain_name, top_level_domain)
|
16
|
+
doc = agent.get(host)
|
17
|
+
|
18
|
+
form = doc.forms.first
|
19
|
+
query = search_keyword(domain_name, top_level_domain)
|
20
|
+
form[keyword_field_name] = query
|
21
|
+
doc = form.submit
|
22
|
+
|
23
|
+
results = parse_results(doc)
|
24
|
+
next_page_link = parse_next_page_link(doc)
|
25
|
+
|
26
|
+
DomainsScanner::Results.new(results, next_page_link)
|
15
27
|
end
|
16
28
|
|
17
|
-
def
|
18
|
-
|
29
|
+
def search_by_link(link)
|
30
|
+
doc = agent.get(link)
|
31
|
+
results = parse_results(doc)
|
32
|
+
next_page_link = parse_next_page_link(doc)
|
33
|
+
|
34
|
+
DomainsScanner::Results.new(results, next_page_link)
|
35
|
+
end
|
36
|
+
|
37
|
+
def parse_next_page_link(doc)
|
38
|
+
next_page_tag = doc.search(next_page_link_selector).first
|
39
|
+
return unless next_page_tag
|
40
|
+
|
41
|
+
href = next_page_tag.attributes["href"]
|
42
|
+
"#{host}#{href}"
|
19
43
|
end
|
20
44
|
|
21
45
|
def search_keyword(domain_name, top_level_domain)
|
22
46
|
"site:*.#{domain_name}.#{top_level_domain}"
|
23
47
|
end
|
24
48
|
|
25
|
-
def
|
26
|
-
raise NotImplementedError
|
49
|
+
def keyword_field_name
|
50
|
+
raise NotImplementedError
|
51
|
+
end
|
52
|
+
|
53
|
+
def parse_results(doc)
|
54
|
+
raise NotImplementedError
|
55
|
+
end
|
56
|
+
|
57
|
+
def have_next_page?(doc)
|
58
|
+
raise NotImplementedError
|
27
59
|
end
|
28
60
|
end
|
29
61
|
end
|
@@ -1,16 +1,12 @@
|
|
1
1
|
module DomainsScanner
|
2
2
|
module Crawlers
|
3
3
|
class Google < Base
|
4
|
-
def
|
5
|
-
|
6
|
-
|
7
|
-
start = (page - 1) * 10
|
8
|
-
doc = agent.get("https://google.com/search?q=#{query}&start=#{start}")
|
9
|
-
|
10
|
-
results = parse_results(doc)
|
11
|
-
have_next_page = have_next_page?(doc)
|
4
|
+
def host
|
5
|
+
"https://google.com"
|
6
|
+
end
|
12
7
|
|
13
|
-
|
8
|
+
def keyword_field_name
|
9
|
+
"q"
|
14
10
|
end
|
15
11
|
|
16
12
|
# [{title: "xxx", url: "xxx"}, ...]
|
@@ -28,8 +24,8 @@ module DomainsScanner
|
|
28
24
|
end
|
29
25
|
end
|
30
26
|
|
31
|
-
def
|
32
|
-
|
27
|
+
def next_page_link_selector
|
28
|
+
"div#foot .cur+td>a"
|
33
29
|
end
|
34
30
|
end
|
35
31
|
end
|
@@ -2,13 +2,13 @@ require 'domains_scanner/result_item'
|
|
2
2
|
|
3
3
|
module DomainsScanner
|
4
4
|
class Results
|
5
|
-
attr_reader :items, :
|
5
|
+
attr_reader :items, :next_page_link
|
6
6
|
|
7
|
-
def initialize(results,
|
7
|
+
def initialize(results, next_page_link)
|
8
8
|
@items = results.map do |result|
|
9
9
|
ResultItem.new(title: result[:title], url: result[:url])
|
10
10
|
end
|
11
|
-
@
|
11
|
+
@next_page_link = next_page_link
|
12
12
|
end
|
13
13
|
end
|
14
14
|
end
|
@@ -13,24 +13,32 @@ module DomainsScanner
|
|
13
13
|
@workers = DomainsScanner.engines.map do |engine|
|
14
14
|
crawler = DomainsScanner::Crawlers.build(engine)
|
15
15
|
page = 1
|
16
|
+
next_page_link = nil
|
16
17
|
|
17
18
|
Thread.new do
|
18
19
|
loop do
|
19
20
|
puts "Scanning #{domain} with #{engine} on page: #{page}" if DomainsScanner.verbose
|
20
21
|
|
21
22
|
begin
|
22
|
-
|
23
|
+
if page == 1
|
24
|
+
puts "Search by form>>>>" if DomainsScanner.verbose
|
25
|
+
results = crawler.search_by_form(@domain_word, @top_level_domain)
|
26
|
+
else
|
27
|
+
puts "Search by link: #{next_page_link}>>>>" if DomainsScanner.verbose
|
28
|
+
results = crawler.search_by_link(next_page_link)
|
29
|
+
end
|
30
|
+
next_page_link = results.next_page_link
|
31
|
+
|
23
32
|
results.items.each do |item|
|
24
|
-
|
25
|
-
|
26
|
-
|
33
|
+
DomainsScanner.output_queue.push({
|
34
|
+
domain: item.host, top_level_domain: @top_level_domain, engine: engine
|
35
|
+
})
|
27
36
|
end
|
28
|
-
break unless results.have_next_page
|
29
37
|
rescue Mechanize::ResponseCodeError => e
|
30
38
|
puts "search in #{engine} error, skip now" if DomainsScanner.verbose
|
31
39
|
end
|
32
40
|
|
33
|
-
break unless page < DomainsScanner.max_page
|
41
|
+
break unless next_page_link && page < DomainsScanner.max_page
|
34
42
|
page += 1
|
35
43
|
end
|
36
44
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: domains_scanner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Martin Hong
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-12-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -86,6 +86,7 @@ files:
|
|
86
86
|
- README.md
|
87
87
|
- Rakefile
|
88
88
|
- bin/domains_scanner
|
89
|
+
- doc/images/preview.png
|
89
90
|
- domains_scanner.gemspec
|
90
91
|
- lib/ansi_colors.rb
|
91
92
|
- lib/domains_scanner.rb
|