tansaku 0.3.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 50036fe49b8faa2f534977c93c8352afdd2db88542bcf7b78323a0a410f4fb18
4
- data.tar.gz: cad855e053c5cad7ebac3f8cef80e546b900a87dd30ad3bae53a9c7503e76c7d
3
+ metadata.gz: a179d2d95fc0e6c78d908ff8379ecefb609448f38284f77c21fcd92dd8a0b7ca
4
+ data.tar.gz: 0eaecd595420390290a4e5b9f8a759b229e096037e60d058d58fd6af02dfe9f0
5
5
  SHA512:
6
- metadata.gz: f4c0c1b4011de97cbbe3eeed76b2f0f07e17487ec0f00e092481ff9f37a0622d161eb1d37dcb938c16c131321da88e04c5a12e7cb1222dce729feb4a58efbf8a
7
- data.tar.gz: 0d03f7cae1bde0eb320ca41a3af43c216e13a264fd61966a7bbf0bba09a0a4c630e82659a88d4c0525109ca148796098c895035bf16fa1b0a3a3fadb7c17aa85
6
+ metadata.gz: a4b252ff9c13cbe70109f8a15b77e9923672f15c5de2f93f26af6e6a0031ebdf0ed2c246e76968843e66f6e4eda0cdeae067b3886c8525237f5a412032780b63
7
+ data.tar.gz: '01952d73467ad3fafce9b3726675c7f145f7db359052cb6682e533f7faa2c42022f8e2f38e07a3f63fe132ee36d8a8dfcbfdc61563c84d9b55d3a8536b1b3146'
data/README.md CHANGED
@@ -37,7 +37,7 @@ Usage:
37
37
  Options:
38
38
  [--additional-list=ADDITIONAL_LIST] # Path to the file which includes additional paths to crawl
39
39
  [--host=HOST] # Host header to use
40
- [--threads=N] # Number of threads to use
40
+ [--max-concurrent-requests=N] # Number of concurrent requests to use
41
41
  [--type=TYPE] # Type of a list to crawl (admin, backup, database, etc, log or all)
42
42
  # Default: all
43
43
  [--user-agent=USER_AGENT] # User-Agent header to use
@@ -9,7 +9,7 @@ module Tansaku
9
9
  desc "crawl URL", "Crawl a given URL"
10
10
  method_option :additional_list, desc: "Path to the file which includes additonal paths to crawl"
11
11
  method_option :host, type: :string, desc: "Host header to use"
12
- method_option :threads, type: :numeric, desc: "Number of threads to use"
12
+ method_option :max_concurrent_requests, type: :numeric, desc: "Number of concurrent requests to use"
13
13
  method_option :type, desc: "Type of a list to crawl (admin, backup, database, etc, log or all)", default: "all"
14
14
  method_option :user_agent, type: :string, desc: "User-Agent header to use"
15
15
  def crawl(url)
@@ -1,10 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "async/http/internet"
4
+ require "async"
5
+ require "async/barrier"
6
+ require "async/semaphore"
3
7
  require "cgi"
4
- require "net/http"
5
- require "parallel"
8
+ require "etc"
6
9
  require "uri"
7
10
 
11
+ require "tansaku/monkey_patch"
12
+
8
13
  module Tansaku
9
14
  class Crawler
10
15
  DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
@@ -13,7 +18,7 @@ module Tansaku
13
18
 
14
19
  attr_reader :additional_list
15
20
  attr_reader :host
16
- attr_reader :threads
21
+ attr_reader :max_concurrent_requests
17
22
  attr_reader :type
18
23
  attr_reader :user_agent
19
24
 
@@ -21,7 +26,7 @@ module Tansaku
21
26
  base_uri,
22
27
  additional_list: nil,
23
28
  host: nil,
24
- threads: Parallel.processor_count,
29
+ max_concurrent_requests: Etc.nprocessors,
25
30
  type: "all",
26
31
  user_agent: DEFAULT_USER_AGENT
27
32
  )
@@ -34,27 +39,39 @@ module Tansaku
34
39
  end
35
40
 
36
41
  @host = host
37
- @threads = threads
42
+ @max_concurrent_requests = max_concurrent_requests
38
43
  @type = type
39
44
  @user_agent = user_agent
40
45
  end
41
46
 
42
- def online?(url)
43
- res = head(url)
44
- [200, 401, 302].include? res.code.to_i
45
- end
46
-
47
47
  def crawl
48
- results = Parallel.map(urls, in_threads: threads) do |url|
49
- url if online?(url)
50
- rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError => _e
51
- nil
48
+ results = {}
49
+ Async do
50
+ barrier = Async::Barrier.new
51
+ semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
52
+ internet = Async::HTTP::Internet.new
53
+
54
+ paths.each do |path|
55
+ semaphore.async do
56
+ url = url_for(path)
57
+ res = internet.head(url, default_request_headers)
58
+
59
+ results[url] = res.status if online?(res.status)
60
+ rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError
61
+ next
62
+ end
63
+ end
64
+ barrier.wait
52
65
  end
53
- results.compact
66
+ results
54
67
  end
55
68
 
56
69
  private
57
70
 
71
+ def online?(status)
72
+ [200, 204, 301, 302, 307, 401, 403].include? status.to_i
73
+ end
74
+
58
75
  def valid_uri?
59
76
  ["http", "https"].include? base_uri.scheme
60
77
  end
@@ -77,16 +94,8 @@ module Tansaku
77
94
  paths.map { |path| url_for path }
78
95
  end
79
96
 
80
- def request(req)
81
- Net::HTTP.start(base_uri.host, base_uri.port) { |http| http.request(req) }
82
- end
83
-
84
- def head(url)
85
- head = Net::HTTP::Head.new(url)
86
- head["User-Agent"] = user_agent
87
- head["Host"] = host unless host.nil?
88
-
89
- request(head)
97
+ def default_request_headers
98
+ @default_request_headers ||= { "host" => host, "user-agent" => user_agent }.compact
90
99
  end
91
100
  end
92
101
  end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "protocol/http1/connection"
4
+
5
+ module Protocol
6
+ module HTTP1
7
+ class Connection
8
+ def write_request(authority, method, path, version, headers)
9
+ host = authority
10
+ if headers.include?("host")
11
+ host = headers["host"]
12
+ headers.delete "host"
13
+ end
14
+
15
+ @stream.write("#{method} #{path} #{version}\r\n")
16
+ @stream.write("host: #{host}\r\n")
17
+
18
+ write_headers(headers)
19
+ end
20
+ end
21
+ end
22
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tansaku
4
- VERSION = "0.3.0"
4
+ VERSION = "1.0.0"
5
5
  end
@@ -31,6 +31,7 @@ Gem::Specification.new do |spec|
31
31
  spec.add_development_dependency "rspec", "~> 3.9"
32
32
  spec.add_development_dependency "webmock", "~> 3.8"
33
33
 
34
- spec.add_dependency "parallel", "~> 1.19"
34
+ spec.add_dependency "async", "~> 1.26"
35
+ spec.add_dependency "async-http", "~> 0.52"
35
36
  spec.add_dependency "thor", "~> 1.0"
36
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tansaku
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manabu Niseki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-06-30 00:00:00.000000000 Z
11
+ date: 2020-07-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -95,19 +95,33 @@ dependencies:
95
95
  - !ruby/object:Gem::Version
96
96
  version: '3.8'
97
97
  - !ruby/object:Gem::Dependency
98
- name: parallel
98
+ name: async
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
101
  - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: '1.19'
103
+ version: '1.26'
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: '1.19'
110
+ version: '1.26'
111
+ - !ruby/object:Gem::Dependency
112
+ name: async-http
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '0.52'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '0.52'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: thor
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -149,6 +163,7 @@ files:
149
163
  - lib/tansaku/lists/database.txt
150
164
  - lib/tansaku/lists/etc.txt
151
165
  - lib/tansaku/lists/log.txt
166
+ - lib/tansaku/monkey_patch.rb
152
167
  - lib/tansaku/path.rb
153
168
  - lib/tansaku/version.rb
154
169
  - tansaku.gemspec