tansaku 0.3.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/tansaku/cli.rb +1 -1
- data/lib/tansaku/crawler.rb +34 -25
- data/lib/tansaku/monkey_patch.rb +22 -0
- data/lib/tansaku/version.rb +1 -1
- data/tansaku.gemspec +2 -1
- metadata +20 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a179d2d95fc0e6c78d908ff8379ecefb609448f38284f77c21fcd92dd8a0b7ca
|
4
|
+
data.tar.gz: 0eaecd595420390290a4e5b9f8a759b229e096037e60d058d58fd6af02dfe9f0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a4b252ff9c13cbe70109f8a15b77e9923672f15c5de2f93f26af6e6a0031ebdf0ed2c246e76968843e66f6e4eda0cdeae067b3886c8525237f5a412032780b63
|
7
|
+
data.tar.gz: '01952d73467ad3fafce9b3726675c7f145f7db359052cb6682e533f7faa2c42022f8e2f38e07a3f63fe132ee36d8a8dfcbfdc61563c84d9b55d3a8536b1b3146'
|
data/README.md
CHANGED
@@ -37,7 +37,7 @@ Usage:
|
|
37
37
|
Options:
|
38
38
|
[--additional-list=ADDITIONAL_LIST] # Path to the file which includes additional paths to crawl
|
39
39
|
[--host=HOST] # Host header to use
|
40
|
-
[--
|
40
|
+
[--max-concurrent-requests=N] # Number of concurrent requests to use
|
41
41
|
[--type=TYPE] # Type of a list to crawl (admin, backup, database, etc, log or all)
|
42
42
|
# Default: all
|
43
43
|
[--user-agent=USER_AGENT] # User-Agent header to use
|
data/lib/tansaku/cli.rb
CHANGED
@@ -9,7 +9,7 @@ module Tansaku
|
|
9
9
|
desc "crawl URL", "Crawl a given URL"
|
10
10
|
method_option :additional_list, desc: "Path to the file which includes additonal paths to crawl"
|
11
11
|
method_option :host, type: :string, desc: "Host header to use"
|
12
|
-
method_option :
|
12
|
+
method_option :max_concurrent_requests, type: :numeric, desc: "Number of concurrent requests to use"
|
13
13
|
method_option :type, desc: "Type of a list to crawl (admin, backup, database, etc, log or all)", default: "all"
|
14
14
|
method_option :user_agent, type: :string, desc: "User-Agent header to use"
|
15
15
|
def crawl(url)
|
data/lib/tansaku/crawler.rb
CHANGED
@@ -1,10 +1,15 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require "async/http/internet"
|
4
|
+
require "async"
|
5
|
+
require "async/barrier"
|
6
|
+
require "async/semaphore"
|
3
7
|
require "cgi"
|
4
|
-
require "
|
5
|
-
require "parallel"
|
8
|
+
require "etc"
|
6
9
|
require "uri"
|
7
10
|
|
11
|
+
require "tansaku/monkey_patch"
|
12
|
+
|
8
13
|
module Tansaku
|
9
14
|
class Crawler
|
10
15
|
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
|
@@ -13,7 +18,7 @@ module Tansaku
|
|
13
18
|
|
14
19
|
attr_reader :additional_list
|
15
20
|
attr_reader :host
|
16
|
-
attr_reader :
|
21
|
+
attr_reader :max_concurrent_requests
|
17
22
|
attr_reader :type
|
18
23
|
attr_reader :user_agent
|
19
24
|
|
@@ -21,7 +26,7 @@ module Tansaku
|
|
21
26
|
base_uri,
|
22
27
|
additional_list: nil,
|
23
28
|
host: nil,
|
24
|
-
|
29
|
+
max_concurrent_requests: Etc.nprocessors,
|
25
30
|
type: "all",
|
26
31
|
user_agent: DEFAULT_USER_AGENT
|
27
32
|
)
|
@@ -34,27 +39,39 @@ module Tansaku
|
|
34
39
|
end
|
35
40
|
|
36
41
|
@host = host
|
37
|
-
@
|
42
|
+
@max_concurrent_requests = max_concurrent_requests
|
38
43
|
@type = type
|
39
44
|
@user_agent = user_agent
|
40
45
|
end
|
41
46
|
|
42
|
-
def online?(url)
|
43
|
-
res = head(url)
|
44
|
-
[200, 401, 302].include? res.code.to_i
|
45
|
-
end
|
46
|
-
|
47
47
|
def crawl
|
48
|
-
results =
|
49
|
-
|
50
|
-
|
51
|
-
|
48
|
+
results = {}
|
49
|
+
Async do
|
50
|
+
barrier = Async::Barrier.new
|
51
|
+
semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
|
52
|
+
internet = Async::HTTP::Internet.new
|
53
|
+
|
54
|
+
paths.each do |path|
|
55
|
+
semaphore.async do
|
56
|
+
url = url_for(path)
|
57
|
+
res = internet.head(url, default_request_headers)
|
58
|
+
|
59
|
+
results[url] = res.status if online?(res.status)
|
60
|
+
rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError
|
61
|
+
next
|
62
|
+
end
|
63
|
+
end
|
64
|
+
barrier.wait
|
52
65
|
end
|
53
|
-
results
|
66
|
+
results
|
54
67
|
end
|
55
68
|
|
56
69
|
private
|
57
70
|
|
71
|
+
def online?(status)
|
72
|
+
[200, 204, 301, 302, 307, 401, 403].include? status.to_i
|
73
|
+
end
|
74
|
+
|
58
75
|
def valid_uri?
|
59
76
|
["http", "https"].include? base_uri.scheme
|
60
77
|
end
|
@@ -77,16 +94,8 @@ module Tansaku
|
|
77
94
|
paths.map { |path| url_for path }
|
78
95
|
end
|
79
96
|
|
80
|
-
def
|
81
|
-
|
82
|
-
end
|
83
|
-
|
84
|
-
def head(url)
|
85
|
-
head = Net::HTTP::Head.new(url)
|
86
|
-
head["User-Agent"] = user_agent
|
87
|
-
head["Host"] = host unless host.nil?
|
88
|
-
|
89
|
-
request(head)
|
97
|
+
def default_request_headers
|
98
|
+
@default_request_headers ||= { "host" => host, "user-agent" => user_agent }.compact
|
90
99
|
end
|
91
100
|
end
|
92
101
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "protocol/http1/connection"
|
4
|
+
|
5
|
+
module Protocol
|
6
|
+
module HTTP1
|
7
|
+
class Connection
|
8
|
+
def write_request(authority, method, path, version, headers)
|
9
|
+
host = authority
|
10
|
+
if headers.include?("host")
|
11
|
+
host = headers["host"]
|
12
|
+
headers.delete "host"
|
13
|
+
end
|
14
|
+
|
15
|
+
@stream.write("#{method} #{path} #{version}\r\n")
|
16
|
+
@stream.write("host: #{host}\r\n")
|
17
|
+
|
18
|
+
write_headers(headers)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/tansaku/version.rb
CHANGED
data/tansaku.gemspec
CHANGED
@@ -31,6 +31,7 @@ Gem::Specification.new do |spec|
|
|
31
31
|
spec.add_development_dependency "rspec", "~> 3.9"
|
32
32
|
spec.add_development_dependency "webmock", "~> 3.8"
|
33
33
|
|
34
|
-
spec.add_dependency "
|
34
|
+
spec.add_dependency "async", "~> 1.26"
|
35
|
+
spec.add_dependency "async-http", "~> 0.52"
|
35
36
|
spec.add_dependency "thor", "~> 1.0"
|
36
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tansaku
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manabu Niseki
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-07-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -95,19 +95,33 @@ dependencies:
|
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '3.8'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
98
|
+
name: async
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: '1.
|
103
|
+
version: '1.26'
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: '1.
|
110
|
+
version: '1.26'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: async-http
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0.52'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0.52'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: thor
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -149,6 +163,7 @@ files:
|
|
149
163
|
- lib/tansaku/lists/database.txt
|
150
164
|
- lib/tansaku/lists/etc.txt
|
151
165
|
- lib/tansaku/lists/log.txt
|
166
|
+
- lib/tansaku/monkey_patch.rb
|
152
167
|
- lib/tansaku/path.rb
|
153
168
|
- lib/tansaku/version.rb
|
154
169
|
- tansaku.gemspec
|