tansaku 0.3.0 → 1.0.0
This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/tansaku/cli.rb +1 -1
- data/lib/tansaku/crawler.rb +34 -25
- data/lib/tansaku/monkey_patch.rb +22 -0
- data/lib/tansaku/version.rb +1 -1
- data/tansaku.gemspec +2 -1
- metadata +20 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a179d2d95fc0e6c78d908ff8379ecefb609448f38284f77c21fcd92dd8a0b7ca
|
4
|
+
data.tar.gz: 0eaecd595420390290a4e5b9f8a759b229e096037e60d058d58fd6af02dfe9f0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a4b252ff9c13cbe70109f8a15b77e9923672f15c5de2f93f26af6e6a0031ebdf0ed2c246e76968843e66f6e4eda0cdeae067b3886c8525237f5a412032780b63
|
7
|
+
data.tar.gz: '01952d73467ad3fafce9b3726675c7f145f7db359052cb6682e533f7faa2c42022f8e2f38e07a3f63fe132ee36d8a8dfcbfdc61563c84d9b55d3a8536b1b3146'
|
data/README.md
CHANGED
@@ -37,7 +37,7 @@ Usage:
|
|
37
37
|
Options:
|
38
38
|
[--additional-list=ADDITIONAL_LIST] # Path to the file which includes additonal paths to crawl
|
39
39
|
[--host=HOST] # Host header to use
|
40
|
-
[--
|
40
|
+
[--max-concurrent-requests=N] # Number of concurrent requests to use
|
41
41
|
[--type=TYPE] # Type of a list to crawl (admin, backup, database, etc, log or all)
|
42
42
|
# Default: all
|
43
43
|
[--user-agent=USER_AGENT] # User-Agent header to use
|
data/lib/tansaku/cli.rb
CHANGED
@@ -9,7 +9,7 @@ module Tansaku
|
|
9
9
|
desc "crawl URL", "Crawl a given URL"
|
10
10
|
method_option :additional_list, desc: "Path to the file which includes additonal paths to crawl"
|
11
11
|
method_option :host, type: :string, desc: "Host header to use"
|
12
|
-
method_option :
|
12
|
+
method_option :max_concurrent_requests, type: :numeric, desc: "Number of concurrent requests to use"
|
13
13
|
method_option :type, desc: "Type of a list to crawl (admin, backup, database, etc, log or all)", default: "all"
|
14
14
|
method_option :user_agent, type: :string, desc: "User-Agent header to use"
|
15
15
|
def crawl(url)
|
data/lib/tansaku/crawler.rb
CHANGED
@@ -1,10 +1,15 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require "async/http/internet"
|
4
|
+
require "async"
|
5
|
+
require "async/barrier"
|
6
|
+
require "async/semaphore"
|
3
7
|
require "cgi"
|
4
|
-
require "
|
5
|
-
require "parallel"
|
8
|
+
require "etc"
|
6
9
|
require "uri"
|
7
10
|
|
11
|
+
require "tansaku/monkey_patch"
|
12
|
+
|
8
13
|
module Tansaku
|
9
14
|
class Crawler
|
10
15
|
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
|
@@ -13,7 +18,7 @@ module Tansaku
|
|
13
18
|
|
14
19
|
attr_reader :additional_list
|
15
20
|
attr_reader :host
|
16
|
-
attr_reader :
|
21
|
+
attr_reader :max_concurrent_requests
|
17
22
|
attr_reader :type
|
18
23
|
attr_reader :user_agent
|
19
24
|
|
@@ -21,7 +26,7 @@ module Tansaku
|
|
21
26
|
base_uri,
|
22
27
|
additional_list: nil,
|
23
28
|
host: nil,
|
24
|
-
|
29
|
+
max_concurrent_requests: Etc.nprocessors,
|
25
30
|
type: "all",
|
26
31
|
user_agent: DEFAULT_USER_AGENT
|
27
32
|
)
|
@@ -34,27 +39,39 @@ module Tansaku
|
|
34
39
|
end
|
35
40
|
|
36
41
|
@host = host
|
37
|
-
@
|
42
|
+
@max_concurrent_requests = max_concurrent_requests
|
38
43
|
@type = type
|
39
44
|
@user_agent = user_agent
|
40
45
|
end
|
41
46
|
|
42
|
-
def online?(url)
|
43
|
-
res = head(url)
|
44
|
-
[200, 401, 302].include? res.code.to_i
|
45
|
-
end
|
46
|
-
|
47
47
|
def crawl
|
48
|
-
results =
|
49
|
-
|
50
|
-
|
51
|
-
|
48
|
+
results = {}
|
49
|
+
Async do
|
50
|
+
barrier = Async::Barrier.new
|
51
|
+
semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
|
52
|
+
internet = Async::HTTP::Internet.new
|
53
|
+
|
54
|
+
paths.each do |path|
|
55
|
+
semaphore.async do
|
56
|
+
url = url_for(path)
|
57
|
+
res = internet.head(url, default_request_headers)
|
58
|
+
|
59
|
+
results[url] = res.status if online?(res.status)
|
60
|
+
rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError
|
61
|
+
next
|
62
|
+
end
|
63
|
+
end
|
64
|
+
barrier.wait
|
52
65
|
end
|
53
|
-
results
|
66
|
+
results
|
54
67
|
end
|
55
68
|
|
56
69
|
private
|
57
70
|
|
71
|
+
def online?(status)
|
72
|
+
[200, 204, 301, 302, 307, 401, 403].include? status.to_i
|
73
|
+
end
|
74
|
+
|
58
75
|
def valid_uri?
|
59
76
|
["http", "https"].include? base_uri.scheme
|
60
77
|
end
|
@@ -77,16 +94,8 @@ module Tansaku
|
|
77
94
|
paths.map { |path| url_for path }
|
78
95
|
end
|
79
96
|
|
80
|
-
def
|
81
|
-
|
82
|
-
end
|
83
|
-
|
84
|
-
def head(url)
|
85
|
-
head = Net::HTTP::Head.new(url)
|
86
|
-
head["User-Agent"] = user_agent
|
87
|
-
head["Host"] = host unless host.nil?
|
88
|
-
|
89
|
-
request(head)
|
97
|
+
def default_request_headers
|
98
|
+
@default_request_headers ||= { "host" => host, "user-agent" => user_agent }.compact
|
90
99
|
end
|
91
100
|
end
|
92
101
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "protocol/http1/connection"
|
4
|
+
|
5
|
+
module Protocol
|
6
|
+
module HTTP1
|
7
|
+
class Connection
|
8
|
+
def write_request(authority, method, path, version, headers)
|
9
|
+
host = authority
|
10
|
+
if headers.include?("host")
|
11
|
+
host = headers["host"]
|
12
|
+
headers.delete "host"
|
13
|
+
end
|
14
|
+
|
15
|
+
@stream.write("#{method} #{path} #{version}\r\n")
|
16
|
+
@stream.write("host: #{host}\r\n")
|
17
|
+
|
18
|
+
write_headers(headers)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/tansaku/version.rb
CHANGED
data/tansaku.gemspec
CHANGED
@@ -31,6 +31,7 @@ Gem::Specification.new do |spec|
|
|
31
31
|
spec.add_development_dependency "rspec", "~> 3.9"
|
32
32
|
spec.add_development_dependency "webmock", "~> 3.8"
|
33
33
|
|
34
|
-
spec.add_dependency "
|
34
|
+
spec.add_dependency "async", "~> 1.26"
|
35
|
+
spec.add_dependency "async-http", "~> 0.52"
|
35
36
|
spec.add_dependency "thor", "~> 1.0"
|
36
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tansaku
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manabu Niseki
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-07-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -95,19 +95,33 @@ dependencies:
|
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '3.8'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
98
|
+
name: async
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: '1.
|
103
|
+
version: '1.26'
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: '1.
|
110
|
+
version: '1.26'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: async-http
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0.52'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0.52'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: thor
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -149,6 +163,7 @@ files:
|
|
149
163
|
- lib/tansaku/lists/database.txt
|
150
164
|
- lib/tansaku/lists/etc.txt
|
151
165
|
- lib/tansaku/lists/log.txt
|
166
|
+
- lib/tansaku/monkey_patch.rb
|
152
167
|
- lib/tansaku/path.rb
|
153
168
|
- lib/tansaku/version.rb
|
154
169
|
- tansaku.gemspec
|