miteru 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miteru/crawler.rb +6 -6
- data/lib/miteru/database.rb +8 -0
- data/lib/miteru/feeds/feed.rb +33 -0
- data/lib/miteru/feeds.rb +19 -37
- data/lib/miteru/kit.rb +19 -2
- data/lib/miteru/mixin.rb +47 -0
- data/lib/miteru/record.rb +1 -0
- data/lib/miteru/version.rb +1 -1
- data/lib/miteru/website.rb +7 -2
- data/lib/miteru.rb +2 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 050c27599e75745a7c215f08b7ed190b43c70388d974a68945702eefdb25b7c2
|
4
|
+
data.tar.gz: 64c7429a4178febf6984fe3d79d3970781634971405c5d9fdb0748c61b663a32
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ff780a0db3fafdded94261c38272dc0537462ae195f646cceafc2223cd62e61fb8d809ee172f953344314a93f9e6f746b9d3b6b66efdf9b48cda2c5d8c645eb7
|
7
|
+
data.tar.gz: c55dfad5120175ebf43bd01a30b42517a9a5bfd6fddd12d8c6ad07acd3ae02bba67a1bc9101d26fab74406f374f16fbeb3f51e791c678dce749a403e9477f82a
|
data/lib/miteru/crawler.rb
CHANGED
@@ -14,8 +14,8 @@ module Miteru
|
|
14
14
|
@notifier = Notifier.new
|
15
15
|
end
|
16
16
|
|
17
|
-
def crawl(url)
|
18
|
-
website = Website.new(url)
|
17
|
+
def crawl(entry)
|
18
|
+
website = Website.new(entry.url, entry.source)
|
19
19
|
downloader.download_kits(website.kits) if website.has_kits? && auto_download?
|
20
20
|
notify(website) if website.has_kits? || verbose?
|
21
21
|
rescue OpenSSL::SSL::SSLError, HTTP::Error, Addressable::URI::InvalidURIError => _e
|
@@ -23,11 +23,11 @@ module Miteru
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def execute
|
26
|
-
|
27
|
-
puts "Loaded #{
|
26
|
+
suspicious_entries = feeds.suspicious_entries
|
27
|
+
puts "Loaded #{suspicious_entries.length} URLs to crawl. (crawling in #{threads} threads)" if verbose?
|
28
28
|
|
29
|
-
Parallel.each(
|
30
|
-
crawl
|
29
|
+
Parallel.each(suspicious_entries, in_threads: threads) do |entry|
|
30
|
+
crawl entry
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
data/lib/miteru/database.rb
CHANGED
@@ -19,6 +19,12 @@ class InitialSchema < ActiveRecord::Migration[6.1]
|
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
|
+
class V11Schema < ActiveRecord::Migration[6.1]
|
23
|
+
def change
|
24
|
+
add_column :records, :source, :string, if_not_exists: true
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
22
28
|
def adapter
|
23
29
|
return "postgresql" if Miteru.configuration.database.start_with?("postgresql://", "postgres://")
|
24
30
|
return "mysql2" if Miteru.configuration.database.start_with?("mysql2://")
|
@@ -44,6 +50,7 @@ module Miteru
|
|
44
50
|
ActiveRecord::Migration.verbose = false
|
45
51
|
|
46
52
|
InitialSchema.migrate(:up)
|
53
|
+
V11Schema.migrate(:up)
|
47
54
|
rescue StandardError => _e
|
48
55
|
# Do nothing
|
49
56
|
end
|
@@ -57,6 +64,7 @@ module Miteru
|
|
57
64
|
return unless ActiveRecord::Base.connected?
|
58
65
|
|
59
66
|
InitialSchema.migrate(:down)
|
67
|
+
V11Schema.migrate(:down)
|
60
68
|
end
|
61
69
|
end
|
62
70
|
end
|
data/lib/miteru/feeds/feed.rb
CHANGED
@@ -3,10 +3,43 @@
|
|
3
3
|
module Miteru
|
4
4
|
class Feeds
|
5
5
|
class Feed
|
6
|
+
include Mixins::URL
|
7
|
+
|
8
|
+
def source
|
9
|
+
@source ||= self.class.to_s.split("::").last
|
10
|
+
end
|
11
|
+
|
12
|
+
#
|
13
|
+
# Return URLs
|
14
|
+
#
|
15
|
+
# @return [Array<String>] URLs
|
16
|
+
#
|
6
17
|
def urls
|
7
18
|
raise NotImplementedError, "You must implement #{self.class}##{__method__}"
|
8
19
|
end
|
9
20
|
|
21
|
+
#
|
22
|
+
# Return entries
|
23
|
+
#
|
24
|
+
# @return [Array<Miteru::Entry>]
|
25
|
+
#
|
26
|
+
def entries
|
27
|
+
breakdowend_urls.map do |url|
|
28
|
+
Entry.new(url, source)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
#
|
33
|
+
# Return breakdowned URLs
|
34
|
+
#
|
35
|
+
# @return [Array<String>] Breakdowned URLs
|
36
|
+
#
|
37
|
+
def breakdowend_urls
|
38
|
+
urls.select { |url| url.start_with?("http://", "https://") }.map do |url|
|
39
|
+
breakdown(url, Miteru.configuration.directory_traveling?)
|
40
|
+
end.flatten.uniq
|
41
|
+
end
|
42
|
+
|
10
43
|
private
|
11
44
|
|
12
45
|
def get(url)
|
data/lib/miteru/feeds.rb
CHANGED
@@ -8,6 +8,18 @@ require_relative "./feeds/urlscan"
|
|
8
8
|
require_relative "./feeds/urlscan_pro"
|
9
9
|
|
10
10
|
module Miteru
|
11
|
+
class Entry
|
12
|
+
# @return [String]
|
13
|
+
attr_reader :url
|
14
|
+
# @return [String]
|
15
|
+
attr_reader :source
|
16
|
+
|
17
|
+
def initialize(url, source)
|
18
|
+
@url = url
|
19
|
+
@source = source
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
11
23
|
class Feeds
|
12
24
|
IGNORE_EXTENSIONS = %w[.htm .html .php .asp .aspx .exe .txt].freeze
|
13
25
|
|
@@ -21,43 +33,13 @@ module Miteru
|
|
21
33
|
].compact
|
22
34
|
end
|
23
35
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
feed.urls.select { |url| url.start_with?("http://", "https://") }
|
32
|
-
end.flatten.uniq
|
33
|
-
|
34
|
-
urls.map { |url| breakdown(url) }.flatten.uniq.sort.each { |url| arr << url }
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def breakdown(url)
|
39
|
-
begin
|
40
|
-
uri = URI.parse(url)
|
41
|
-
rescue URI::InvalidURIError => _e
|
42
|
-
return []
|
43
|
-
end
|
44
|
-
|
45
|
-
base = "#{uri.scheme}://#{uri.hostname}"
|
46
|
-
return [base] unless directory_traveling?
|
47
|
-
|
48
|
-
segments = uri.path.split("/")
|
49
|
-
return [base] if segments.length.zero?
|
50
|
-
|
51
|
-
urls = (0...segments.length).map { |idx| "#{base}#{segments[0..idx].join("/")}" }
|
52
|
-
|
53
|
-
urls.reject do |breakdowned_url|
|
54
|
-
# Reject a url which ends with specific extension names
|
55
|
-
invalid_extension? breakdowned_url
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
def invalid_extension?(url)
|
60
|
-
IGNORE_EXTENSIONS.any? { |ext| url.end_with? ext }
|
36
|
+
#
|
37
|
+
# Returns a list of suspicious entries
|
38
|
+
#
|
39
|
+
# @return [Array<Entry>]
|
40
|
+
#
|
41
|
+
def suspicious_entries
|
42
|
+
@suspicious_entries ||= @feeds.map(&:entries).flatten.uniq(&:url)
|
61
43
|
end
|
62
44
|
end
|
63
45
|
end
|
data/lib/miteru/kit.rb
CHANGED
@@ -9,10 +9,27 @@ module Miteru
|
|
9
9
|
VALID_EXTENSIONS = Miteru.configuration.valid_extensions
|
10
10
|
VALID_MIME_TYPES = Miteru.configuration.valid_mime_types
|
11
11
|
|
12
|
-
|
12
|
+
# @return [String]
|
13
|
+
attr_reader :url
|
13
14
|
|
14
|
-
|
15
|
+
# @return [String]
|
16
|
+
attr_reader :source
|
17
|
+
|
18
|
+
# @return [Integer, nil]
|
19
|
+
attr_reader :status
|
20
|
+
|
21
|
+
# @return [Integer, nil]
|
22
|
+
attr_reader :content_length
|
23
|
+
|
24
|
+
# @return [String, nil]
|
25
|
+
attr_reader :mime_type
|
26
|
+
|
27
|
+
# @return [Hash, nil]
|
28
|
+
attr_reader :headers
|
29
|
+
|
30
|
+
def initialize(url, source)
|
15
31
|
@url = url
|
32
|
+
@source = source
|
16
33
|
|
17
34
|
@content_length = nil
|
18
35
|
@mime_type = nil
|
data/lib/miteru/mixin.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
module Miteru
|
2
|
+
module Mixins
|
3
|
+
module URL
|
4
|
+
IGNORE_EXTENSIONS = %w[.htm .html .php .asp .aspx .exe .txt].freeze
|
5
|
+
|
6
|
+
#
|
7
|
+
# Validate extension of a URL
|
8
|
+
#
|
9
|
+
# @param [String] url
|
10
|
+
#
|
11
|
+
# @return [Boolean]
|
12
|
+
#
|
13
|
+
def invalid_extension?(url)
|
14
|
+
IGNORE_EXTENSIONS.any? { |ext| url.end_with? ext }
|
15
|
+
end
|
16
|
+
|
17
|
+
#
|
18
|
+
# Breakdown a URL into URLs
|
19
|
+
#
|
20
|
+
# @param [String] url
|
21
|
+
# @param [Boolean] enable_directory_traveling
|
22
|
+
#
|
23
|
+
# @return [Array<String>]
|
24
|
+
#
|
25
|
+
def breakdown(url, enable_directory_traveling)
|
26
|
+
begin
|
27
|
+
uri = URI.parse(url)
|
28
|
+
rescue URI::InvalidURIError => _e
|
29
|
+
return []
|
30
|
+
end
|
31
|
+
|
32
|
+
base = "#{uri.scheme}://#{uri.hostname}"
|
33
|
+
return [base] unless enable_directory_traveling
|
34
|
+
|
35
|
+
segments = uri.path.split("/")
|
36
|
+
return [base] if segments.length.zero?
|
37
|
+
|
38
|
+
urls = (0...segments.length).map { |idx| "#{base}#{segments[0..idx].join("/")}" }
|
39
|
+
|
40
|
+
urls.reject do |breakdowned_url|
|
41
|
+
# Reject a url which ends with specific extension names
|
42
|
+
invalid_extension? breakdowned_url
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
data/lib/miteru/record.rb
CHANGED
data/lib/miteru/version.rb
CHANGED
data/lib/miteru/website.rb
CHANGED
@@ -6,10 +6,15 @@ module Miteru
|
|
6
6
|
class Website
|
7
7
|
VALID_EXTENSIONS = Miteru.configuration.valid_extensions
|
8
8
|
|
9
|
+
# @return [String]
|
9
10
|
attr_reader :url
|
10
11
|
|
11
|
-
|
12
|
+
# @return [String]
|
13
|
+
attr_reader :source
|
14
|
+
|
15
|
+
def initialize(url, source)
|
12
16
|
@url = url
|
17
|
+
@source = source
|
13
18
|
end
|
14
19
|
|
15
20
|
def title
|
@@ -18,7 +23,7 @@ module Miteru
|
|
18
23
|
|
19
24
|
def kits
|
20
25
|
@kits ||= links.filter_map do |link|
|
21
|
-
kit = Kit.new(link)
|
26
|
+
kit = Kit.new(link, source)
|
22
27
|
kit.valid? ? kit : nil
|
23
28
|
end
|
24
29
|
end
|
data/lib/miteru.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miteru
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manabu Niseki
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-09-
|
11
|
+
date: 2021-09-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -370,6 +370,7 @@ files:
|
|
370
370
|
- lib/miteru/feeds/urlscan_pro.rb
|
371
371
|
- lib/miteru/http_client.rb
|
372
372
|
- lib/miteru/kit.rb
|
373
|
+
- lib/miteru/mixin.rb
|
373
374
|
- lib/miteru/notifier.rb
|
374
375
|
- lib/miteru/record.rb
|
375
376
|
- lib/miteru/version.rb
|