miteru 1.0.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miteru/crawler.rb +6 -6
- data/lib/miteru/database.rb +8 -0
- data/lib/miteru/feeds/feed.rb +33 -0
- data/lib/miteru/feeds.rb +19 -37
- data/lib/miteru/kit.rb +19 -2
- data/lib/miteru/mixin.rb +47 -0
- data/lib/miteru/record.rb +1 -0
- data/lib/miteru/version.rb +1 -1
- data/lib/miteru/website.rb +7 -2
- data/lib/miteru.rb +2 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 050c27599e75745a7c215f08b7ed190b43c70388d974a68945702eefdb25b7c2
|
4
|
+
data.tar.gz: 64c7429a4178febf6984fe3d79d3970781634971405c5d9fdb0748c61b663a32
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ff780a0db3fafdded94261c38272dc0537462ae195f646cceafc2223cd62e61fb8d809ee172f953344314a93f9e6f746b9d3b6b66efdf9b48cda2c5d8c645eb7
|
7
|
+
data.tar.gz: c55dfad5120175ebf43bd01a30b42517a9a5bfd6fddd12d8c6ad07acd3ae02bba67a1bc9101d26fab74406f374f16fbeb3f51e791c678dce749a403e9477f82a
|
data/lib/miteru/crawler.rb
CHANGED
@@ -14,8 +14,8 @@ module Miteru
|
|
14
14
|
@notifier = Notifier.new
|
15
15
|
end
|
16
16
|
|
17
|
-
def crawl(url)
|
18
|
-
website = Website.new(url)
|
17
|
+
def crawl(entry)
|
18
|
+
website = Website.new(entry.url, entry.source)
|
19
19
|
downloader.download_kits(website.kits) if website.has_kits? && auto_download?
|
20
20
|
notify(website) if website.has_kits? || verbose?
|
21
21
|
rescue OpenSSL::SSL::SSLError, HTTP::Error, Addressable::URI::InvalidURIError => _e
|
@@ -23,11 +23,11 @@ module Miteru
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def execute
|
26
|
-
|
27
|
-
puts "Loaded #{
|
26
|
+
suspicious_entries = feeds.suspicious_entries
|
27
|
+
puts "Loaded #{suspicious_entries.length} URLs to crawl. (crawling in #{threads} threads)" if verbose?
|
28
28
|
|
29
|
-
Parallel.each(
|
30
|
-
crawl
|
29
|
+
Parallel.each(suspicious_entries, in_threads: threads) do |entry|
|
30
|
+
crawl entry
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
data/lib/miteru/database.rb
CHANGED
@@ -19,6 +19,12 @@ class InitialSchema < ActiveRecord::Migration[6.1]
|
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
|
+
class V11Schema < ActiveRecord::Migration[6.1]
|
23
|
+
def change
|
24
|
+
add_column :records, :source, :string, if_not_exists: true
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
22
28
|
def adapter
|
23
29
|
return "postgresql" if Miteru.configuration.database.start_with?("postgresql://", "postgres://")
|
24
30
|
return "mysql2" if Miteru.configuration.database.start_with?("mysql2://")
|
@@ -44,6 +50,7 @@ module Miteru
|
|
44
50
|
ActiveRecord::Migration.verbose = false
|
45
51
|
|
46
52
|
InitialSchema.migrate(:up)
|
53
|
+
V11Schema.migrate(:up)
|
47
54
|
rescue StandardError => _e
|
48
55
|
# Do nothing
|
49
56
|
end
|
@@ -57,6 +64,7 @@ module Miteru
|
|
57
64
|
return unless ActiveRecord::Base.connected?
|
58
65
|
|
59
66
|
InitialSchema.migrate(:down)
|
67
|
+
V11Schema.migrate(:down)
|
60
68
|
end
|
61
69
|
end
|
62
70
|
end
|
data/lib/miteru/feeds/feed.rb
CHANGED
@@ -3,10 +3,43 @@
|
|
3
3
|
module Miteru
|
4
4
|
class Feeds
|
5
5
|
class Feed
|
6
|
+
include Mixins::URL
|
7
|
+
|
8
|
+
def source
|
9
|
+
@source ||= self.class.to_s.split("::").last
|
10
|
+
end
|
11
|
+
|
12
|
+
#
|
13
|
+
# Return URLs
|
14
|
+
#
|
15
|
+
# @return [Array<String>] URLs
|
16
|
+
#
|
6
17
|
def urls
|
7
18
|
raise NotImplementedError, "You must implement #{self.class}##{__method__}"
|
8
19
|
end
|
9
20
|
|
21
|
+
#
|
22
|
+
# Return entries
|
23
|
+
#
|
24
|
+
# @return [Array<Miteru::Entry>]
|
25
|
+
#
|
26
|
+
def entries
|
27
|
+
breakdowend_urls.map do |url|
|
28
|
+
Entry.new(url, source)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
#
|
33
|
+
# Return breakdowned URLs
|
34
|
+
#
|
35
|
+
# @return [Array<String>] Breakdowned URLs
|
36
|
+
#
|
37
|
+
def breakdowend_urls
|
38
|
+
urls.select { |url| url.start_with?("http://", "https://") }.map do |url|
|
39
|
+
breakdown(url, Miteru.configuration.directory_traveling?)
|
40
|
+
end.flatten.uniq
|
41
|
+
end
|
42
|
+
|
10
43
|
private
|
11
44
|
|
12
45
|
def get(url)
|
data/lib/miteru/feeds.rb
CHANGED
@@ -8,6 +8,18 @@ require_relative "./feeds/urlscan"
|
|
8
8
|
require_relative "./feeds/urlscan_pro"
|
9
9
|
|
10
10
|
module Miteru
|
11
|
+
class Entry
|
12
|
+
# @return [String]
|
13
|
+
attr_reader :url
|
14
|
+
# @return [String]
|
15
|
+
attr_reader :source
|
16
|
+
|
17
|
+
def initialize(url, source)
|
18
|
+
@url = url
|
19
|
+
@source = source
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
11
23
|
class Feeds
|
12
24
|
IGNORE_EXTENSIONS = %w[.htm .html .php .asp .aspx .exe .txt].freeze
|
13
25
|
|
@@ -21,43 +33,13 @@ module Miteru
|
|
21
33
|
].compact
|
22
34
|
end
|
23
35
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
feed.urls.select { |url| url.start_with?("http://", "https://") }
|
32
|
-
end.flatten.uniq
|
33
|
-
|
34
|
-
urls.map { |url| breakdown(url) }.flatten.uniq.sort.each { |url| arr << url }
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def breakdown(url)
|
39
|
-
begin
|
40
|
-
uri = URI.parse(url)
|
41
|
-
rescue URI::InvalidURIError => _e
|
42
|
-
return []
|
43
|
-
end
|
44
|
-
|
45
|
-
base = "#{uri.scheme}://#{uri.hostname}"
|
46
|
-
return [base] unless directory_traveling?
|
47
|
-
|
48
|
-
segments = uri.path.split("/")
|
49
|
-
return [base] if segments.length.zero?
|
50
|
-
|
51
|
-
urls = (0...segments.length).map { |idx| "#{base}#{segments[0..idx].join("/")}" }
|
52
|
-
|
53
|
-
urls.reject do |breakdowned_url|
|
54
|
-
# Reject a url which ends with specific extension names
|
55
|
-
invalid_extension? breakdowned_url
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
def invalid_extension?(url)
|
60
|
-
IGNORE_EXTENSIONS.any? { |ext| url.end_with? ext }
|
36
|
+
#
|
37
|
+
# Returns a list of suspicious entries
|
38
|
+
#
|
39
|
+
# @return [Array<Entry>]
|
40
|
+
#
|
41
|
+
def suspicious_entries
|
42
|
+
@suspicious_entries ||= @feeds.map(&:entries).flatten.uniq(&:url)
|
61
43
|
end
|
62
44
|
end
|
63
45
|
end
|
data/lib/miteru/kit.rb
CHANGED
@@ -9,10 +9,27 @@ module Miteru
|
|
9
9
|
VALID_EXTENSIONS = Miteru.configuration.valid_extensions
|
10
10
|
VALID_MIME_TYPES = Miteru.configuration.valid_mime_types
|
11
11
|
|
12
|
-
|
12
|
+
# @return [String]
|
13
|
+
attr_reader :url
|
13
14
|
|
14
|
-
|
15
|
+
# @return [String]
|
16
|
+
attr_reader :source
|
17
|
+
|
18
|
+
# @return [Integer, nil]
|
19
|
+
attr_reader :status
|
20
|
+
|
21
|
+
# @return [Integer, nil]
|
22
|
+
attr_reader :content_length
|
23
|
+
|
24
|
+
# @return [String, nil]
|
25
|
+
attr_reader :mime_type
|
26
|
+
|
27
|
+
# @return [Hash, nil]
|
28
|
+
attr_reader :headers
|
29
|
+
|
30
|
+
def initialize(url, source)
|
15
31
|
@url = url
|
32
|
+
@source = source
|
16
33
|
|
17
34
|
@content_length = nil
|
18
35
|
@mime_type = nil
|
data/lib/miteru/mixin.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
module Miteru
|
2
|
+
module Mixins
|
3
|
+
module URL
|
4
|
+
IGNORE_EXTENSIONS = %w[.htm .html .php .asp .aspx .exe .txt].freeze
|
5
|
+
|
6
|
+
#
|
7
|
+
# Validate extension of a URL
|
8
|
+
#
|
9
|
+
# @param [String] url
|
10
|
+
#
|
11
|
+
# @return [Boolean]
|
12
|
+
#
|
13
|
+
def invalid_extension?(url)
|
14
|
+
IGNORE_EXTENSIONS.any? { |ext| url.end_with? ext }
|
15
|
+
end
|
16
|
+
|
17
|
+
#
|
18
|
+
# Breakdown a URL into URLs
|
19
|
+
#
|
20
|
+
# @param [String] url
|
21
|
+
# @param [Boolean] enable_directory_traveling
|
22
|
+
#
|
23
|
+
# @return [Array<String>]
|
24
|
+
#
|
25
|
+
def breakdown(url, enable_directory_traveling)
|
26
|
+
begin
|
27
|
+
uri = URI.parse(url)
|
28
|
+
rescue URI::InvalidURIError => _e
|
29
|
+
return []
|
30
|
+
end
|
31
|
+
|
32
|
+
base = "#{uri.scheme}://#{uri.hostname}"
|
33
|
+
return [base] unless enable_directory_traveling
|
34
|
+
|
35
|
+
segments = uri.path.split("/")
|
36
|
+
return [base] if segments.length.zero?
|
37
|
+
|
38
|
+
urls = (0...segments.length).map { |idx| "#{base}#{segments[0..idx].join("/")}" }
|
39
|
+
|
40
|
+
urls.reject do |breakdowned_url|
|
41
|
+
# Reject a url which ends with specific extension names
|
42
|
+
invalid_extension? breakdowned_url
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
data/lib/miteru/record.rb
CHANGED
data/lib/miteru/version.rb
CHANGED
data/lib/miteru/website.rb
CHANGED
@@ -6,10 +6,15 @@ module Miteru
|
|
6
6
|
class Website
|
7
7
|
VALID_EXTENSIONS = Miteru.configuration.valid_extensions
|
8
8
|
|
9
|
+
# @return [String]
|
9
10
|
attr_reader :url
|
10
11
|
|
11
|
-
|
12
|
+
# @return [String]
|
13
|
+
attr_reader :source
|
14
|
+
|
15
|
+
def initialize(url, source)
|
12
16
|
@url = url
|
17
|
+
@source = source
|
13
18
|
end
|
14
19
|
|
15
20
|
def title
|
@@ -18,7 +23,7 @@ module Miteru
|
|
18
23
|
|
19
24
|
def kits
|
20
25
|
@kits ||= links.filter_map do |link|
|
21
|
-
kit = Kit.new(link)
|
26
|
+
kit = Kit.new(link, source)
|
22
27
|
kit.valid? ? kit : nil
|
23
28
|
end
|
24
29
|
end
|
data/lib/miteru.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miteru
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manabu Niseki
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-09-
|
11
|
+
date: 2021-09-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -370,6 +370,7 @@ files:
|
|
370
370
|
- lib/miteru/feeds/urlscan_pro.rb
|
371
371
|
- lib/miteru/http_client.rb
|
372
372
|
- lib/miteru/kit.rb
|
373
|
+
- lib/miteru/mixin.rb
|
373
374
|
- lib/miteru/notifier.rb
|
374
375
|
- lib/miteru/record.rb
|
375
376
|
- lib/miteru/version.rb
|