tansaku 0.3.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 50036fe49b8faa2f534977c93c8352afdd2db88542bcf7b78323a0a410f4fb18
4
- data.tar.gz: cad855e053c5cad7ebac3f8cef80e546b900a87dd30ad3bae53a9c7503e76c7d
3
+ metadata.gz: a179d2d95fc0e6c78d908ff8379ecefb609448f38284f77c21fcd92dd8a0b7ca
4
+ data.tar.gz: 0eaecd595420390290a4e5b9f8a759b229e096037e60d058d58fd6af02dfe9f0
5
5
  SHA512:
6
- metadata.gz: f4c0c1b4011de97cbbe3eeed76b2f0f07e17487ec0f00e092481ff9f37a0622d161eb1d37dcb938c16c131321da88e04c5a12e7cb1222dce729feb4a58efbf8a
7
- data.tar.gz: 0d03f7cae1bde0eb320ca41a3af43c216e13a264fd61966a7bbf0bba09a0a4c630e82659a88d4c0525109ca148796098c895035bf16fa1b0a3a3fadb7c17aa85
6
+ metadata.gz: a4b252ff9c13cbe70109f8a15b77e9923672f15c5de2f93f26af6e6a0031ebdf0ed2c246e76968843e66f6e4eda0cdeae067b3886c8525237f5a412032780b63
7
+ data.tar.gz: '01952d73467ad3fafce9b3726675c7f145f7db359052cb6682e533f7faa2c42022f8e2f38e07a3f63fe132ee36d8a8dfcbfdc61563c84d9b55d3a8536b1b3146'
data/README.md CHANGED
@@ -37,7 +37,7 @@ Usage:
37
37
  Options:
38
38
  [--additional-list=ADDITIONAL_LIST] # Path to the file which includes additional paths to crawl
39
39
  [--host=HOST] # Host header to use
40
- [--threads=N] # Number of threads to use
40
+ [--max-concurrent-requests=N] # Number of concurrent requests to use
41
41
  [--type=TYPE] # Type of a list to crawl (admin, backup, database, etc, log or all)
42
42
  # Default: all
43
43
  [--user-agent=USER_AGENT] # User-Agent header to use
@@ -9,7 +9,7 @@ module Tansaku
9
9
  desc "crawl URL", "Crawl a given URL"
10
10
  method_option :additional_list, desc: "Path to the file which includes additonal paths to crawl"
11
11
  method_option :host, type: :string, desc: "Host header to use"
12
- method_option :threads, type: :numeric, desc: "Number of threads to use"
12
+ method_option :max_concurrent_requests, type: :numeric, desc: "Number of concurrent requests to use"
13
13
  method_option :type, desc: "Type of a list to crawl (admin, backup, database, etc, log or all)", default: "all"
14
14
  method_option :user_agent, type: :string, desc: "User-Agent header to use"
15
15
  def crawl(url)
@@ -1,10 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "async/http/internet"
4
+ require "async"
5
+ require "async/barrier"
6
+ require "async/semaphore"
3
7
  require "cgi"
4
- require "net/http"
5
- require "parallel"
8
+ require "etc"
6
9
  require "uri"
7
10
 
11
+ require "tansaku/monkey_patch"
12
+
8
13
  module Tansaku
9
14
  class Crawler
10
15
  DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
@@ -13,7 +18,7 @@ module Tansaku
13
18
 
14
19
  attr_reader :additional_list
15
20
  attr_reader :host
16
- attr_reader :threads
21
+ attr_reader :max_concurrent_requests
17
22
  attr_reader :type
18
23
  attr_reader :user_agent
19
24
 
@@ -21,7 +26,7 @@ module Tansaku
21
26
  base_uri,
22
27
  additional_list: nil,
23
28
  host: nil,
24
- threads: Parallel.processor_count,
29
+ max_concurrent_requests: Etc.nprocessors,
25
30
  type: "all",
26
31
  user_agent: DEFAULT_USER_AGENT
27
32
  )
@@ -34,27 +39,39 @@ module Tansaku
34
39
  end
35
40
 
36
41
  @host = host
37
- @threads = threads
42
+ @max_concurrent_requests = max_concurrent_requests
38
43
  @type = type
39
44
  @user_agent = user_agent
40
45
  end
41
46
 
42
- def online?(url)
43
- res = head(url)
44
- [200, 401, 302].include? res.code.to_i
45
- end
46
-
47
47
  def crawl
48
- results = Parallel.map(urls, in_threads: threads) do |url|
49
- url if online?(url)
50
- rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError => _e
51
- nil
48
+ results = {}
49
+ Async do
50
+ barrier = Async::Barrier.new
51
+ semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
52
+ internet = Async::HTTP::Internet.new
53
+
54
+ paths.each do |path|
55
+ semaphore.async do
56
+ url = url_for(path)
57
+ res = internet.head(url, default_request_headers)
58
+
59
+ results[url] = res.status if online?(res.status)
60
+ rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError
61
+ next
62
+ end
63
+ end
64
+ barrier.wait
52
65
  end
53
- results.compact
66
+ results
54
67
  end
55
68
 
56
69
  private
57
70
 
71
+ def online?(status)
72
+ [200, 204, 301, 302, 307, 401, 403].include? status.to_i
73
+ end
74
+
58
75
  def valid_uri?
59
76
  ["http", "https"].include? base_uri.scheme
60
77
  end
@@ -77,16 +94,8 @@ module Tansaku
77
94
  paths.map { |path| url_for path }
78
95
  end
79
96
 
80
- def request(req)
81
- Net::HTTP.start(base_uri.host, base_uri.port) { |http| http.request(req) }
82
- end
83
-
84
- def head(url)
85
- head = Net::HTTP::Head.new(url)
86
- head["User-Agent"] = user_agent
87
- head["Host"] = host unless host.nil?
88
-
89
- request(head)
97
+ def default_request_headers
98
+ @default_request_headers ||= { "host" => host, "user-agent" => user_agent }.compact
90
99
  end
91
100
  end
92
101
  end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "protocol/http1/connection"
4
+
5
+ module Protocol
6
+ module HTTP1
7
+ class Connection
8
+ def write_request(authority, method, path, version, headers)
9
+ host = authority
10
+ if headers.include?("host")
11
+ host = headers["host"]
12
+ headers.delete "host"
13
+ end
14
+
15
+ @stream.write("#{method} #{path} #{version}\r\n")
16
+ @stream.write("host: #{host}\r\n")
17
+
18
+ write_headers(headers)
19
+ end
20
+ end
21
+ end
22
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tansaku
4
- VERSION = "0.3.0"
4
+ VERSION = "1.0.0"
5
5
  end
@@ -31,6 +31,7 @@ Gem::Specification.new do |spec|
31
31
  spec.add_development_dependency "rspec", "~> 3.9"
32
32
  spec.add_development_dependency "webmock", "~> 3.8"
33
33
 
34
- spec.add_dependency "parallel", "~> 1.19"
34
+ spec.add_dependency "async", "~> 1.26"
35
+ spec.add_dependency "async-http", "~> 0.52"
35
36
  spec.add_dependency "thor", "~> 1.0"
36
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tansaku
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manabu Niseki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-06-30 00:00:00.000000000 Z
11
+ date: 2020-07-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -95,19 +95,33 @@ dependencies:
95
95
  - !ruby/object:Gem::Version
96
96
  version: '3.8'
97
97
  - !ruby/object:Gem::Dependency
98
- name: parallel
98
+ name: async
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
101
  - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: '1.19'
103
+ version: '1.26'
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: '1.19'
110
+ version: '1.26'
111
+ - !ruby/object:Gem::Dependency
112
+ name: async-http
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '0.52'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '0.52'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: thor
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -149,6 +163,7 @@ files:
149
163
  - lib/tansaku/lists/database.txt
150
164
  - lib/tansaku/lists/etc.txt
151
165
  - lib/tansaku/lists/log.txt
166
+ - lib/tansaku/monkey_patch.rb
152
167
  - lib/tansaku/path.rb
153
168
  - lib/tansaku/version.rb
154
169
  - tansaku.gemspec