webmole 1.0.0
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +49 -0
- data/bin/webmole +82 -0
- data/lib/webmole/crawler.rb +161 -0
- data/lib/webmole/gathering_mode.rb +45 -0
- data/lib/webmole/output_formatter.rb +84 -0
- data/lib/webmole/scraper.rb +77 -0
- data/lib/webmole/user_agent_switcher.rb +41 -0
- data/lib/webmole/version.rb +3 -0
- data/lib/webmole.rb +51 -0
- data/webmole.gemspec +30 -0
- metadata +168 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: df9c01135d0b83dafa566f883f43f35defcc819873709a8da2a176b149444451
+  data.tar.gz: 11c6c44c605c0d740bc4ebc244b6e34acd56f637f80fbde72b177133df7fee6f
+SHA512:
+  metadata.gz: 7203dd2fc644fe3cee8a6c4c6c31bcb100596e157ad5ab274d6774d42a36779c263b371e7016c1a532d557f13edc5df73aaa72905fc751f4633868b245be1919
+  data.tar.gz: 60d83b4f4e70d4da6d2f7c086f90b15696a0ad97042e34a97a0528c02f3eac9883ca32d6b4ffa8f334451eefd39eae006675465a0b72e191640af5326b21e2d4
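
These digests can be checked by hand: a `.gem` package is a plain tar archive containing `metadata.gz` and `data.tar.gz`, so, assuming a local copy of the package:

```
tar -xf webmole-1.0.0.gem metadata.gz data.tar.gz
sha256sum metadata.gz data.tar.gz
```
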
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2024 Subnetmasked
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,49 @@
+# WebMole
+
+WebMole is a powerful web scraping tool built in Ruby. It allows you to crawl websites and extract various types of information such as emails, phone numbers, URLs, social media handles, addresses, and more.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'webmole'
+```
+
+And then execute:
+
+    $ bundle install
+
+Or install it yourself as:
+
+    $ gem install webmole
+
+## Usage
+
+To use WebMole, run the following command:
+
+```
+webmole -u https://example.com -s emails -d 2
+```
+
+This will crawl https://example.com to a depth of 2 and extract all email addresses found.
+
+For more options, run:
+
+```
+webmole --help
+```
+
+## Development
+
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+To install this gem onto your local machine, run `bundle exec rake install`.
+
+## Contributing
+
+Bug reports and pull requests are welcome on GitHub at https://github.com/subnetmasked/webmole.
+
+## License
+
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
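
As a fuller illustration of the CLI surface defined in `bin/webmole` below, a hypothetical run that scrapes phone numbers from a single domain with four threads and writes CSV with source URLs might look like:

```
webmole -u https://example.com -s phone_numbers -d 3 -t 4 --delay 0.5 --restrict-domain --save-source-url -o results.csv -f csv
```
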
data/bin/webmole
ADDED
@@ -0,0 +1,82 @@
+#!/usr/bin/env ruby
+
+require 'optparse'
+require 'webmole'
+
+options = {}
+OptionParser.new do |opts|
+  opts.banner = "Usage: webmole [options]"
+
+  opts.on("-v", "--version", "Show version") do
+    puts "WebMole version #{WebMole::VERSION}"
+    exit
+  end
+
+  opts.on("-d", "--depth DEPTH", Integer, "Crawl depth (default: 3)") do |d|
+    options[:depth] = d
+  end
+
+  opts.on("-u", "--url URL", "Starting URL to crawl") do |url|
+    options[:url] = url
+  end
+
+  opts.on("-h", "--help", "Show this help message") do
+    puts opts
+    exit
+  end
+
+  opts.on("--delay SECONDS", Float, "Delay between requests in seconds (default: 1.0)") do |delay|
+    options[:delay] = delay
+  end
+
+  opts.on("-o", "--output FILE", "Output file") do |file|
+    options[:output] = file
+  end
+
+  opts.on("-f", "--format FORMAT", "Output format (txt, yaml, or csv)") do |format|
+    options[:format] = format.downcase
+  end
+
+  opts.on("-t", "--threads NUM", Integer, "Number of threads to use (default: 1)") do |t|
+    options[:threads] = t
+  end
+
+  opts.on("-s", "--scrape OPTION", "Scrape option (emails, phone_numbers, urls, social_media, addresses, credit_cards, custom)") do |option|
+    options[:scrape_option] = option.to_sym
+  end
+
+  opts.on("-p", "--pattern REGEX", "Custom regex pattern to search for (use with -s custom)") do |pattern|
+    options[:pattern] = pattern
+  end
+
+  opts.on("--verbose", "Enable verbose output") do
+    options[:verbose] = true
+  end
+
+  opts.on("--restrict-domain", "Restrict crawling to the initial domain") do
+    options[:restrict_domain] = true
+  end
+
+  opts.on("--timeout SECONDS", Integer, "Set a timeout for the crawl (default: 300 seconds)") do |t|
+    options[:timeout] = t
+  end
+
+  opts.on("--save-source-url", "Save the source URL for each match") do
+    options[:save_source_url] = true
+  end
+
+  opts.on("-g", "--gathering", "Enable gathering mode") do
+    options[:gathering_mode] = true
+  end
+end.parse!
+
+WebMole.print_banner
+WebMole.print_disclaimer
+
+begin
+  puts "\nPress Enter to continue or Ctrl+C to exit.".colorize(:yellow)
+  gets
+  WebMole.run(options)
+rescue Interrupt
+  puts "\nScript terminated by user.".colorize(:red)
+end
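
Since the executable only assembles an options hash and hands it to `WebMole.run`, the same crawl can be driven from Ruby directly; a minimal sketch (the URL and values are placeholders):

```ruby
require 'webmole'

# Keys mirror what the OptionParser block above sets
options = {
  url: 'https://example.com',  # placeholder target
  scrape_option: :emails,
  depth: 2,
  delay: 0.5,
  restrict_domain: true
}
WebMole.run(options)
```
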
data/lib/webmole/crawler.rb
ADDED
@@ -0,0 +1,161 @@
+require_relative 'scraper'
+require_relative 'output_formatter'
+require_relative 'user_agent_switcher'
+require_relative 'gathering_mode'
+require 'open-uri'
+
+module WebMole
+  class Crawler
+    def initialize(options)
+      @url = options[:url]
+      @depth = options[:depth] || 3
+      @delay = options[:delay] || 1.0
+      @threads = options[:threads] || 1
+      @verbose = options[:verbose]
+      @restrict_domain = options[:restrict_domain]
+      @initial_domain = URI(@url).host
+      @timeout = options[:timeout] || 300
+      @urls_to_scrape = Set.new
+      @visited = Set.new
+      @mutex = Mutex.new
+      @output = options[:output]
+      @format = options[:format]
+      @save_source_url = options[:save_source_url]
+      @user_agent_switcher = UserAgentSwitcher.new
+      @gathering_mode = options[:gathering_mode]
+      @scraper = Scraper.new(options[:scrape_option], options[:pattern], @gathering_mode)
+      GatheringMode.setup if @gathering_mode
+    end
+
+    def crawl
+      start_time = Time.now
+
+      puts "Phase 1: Discovering URLs to scrape...".colorize(:cyan)
+      discover_urls
+
+      puts "\nPhase 2: Scraping discovered URLs...".colorize(:cyan)
+      process_urls
+
+      end_time = Time.now
+      print_summary(start_time, end_time)
+    end
+
+    private
+
+    def discover_urls
+      queue = Queue.new
+      queue.push([@url, @depth])
+      @visited.add(@url)
+
+      thread_count = [@threads, 1].max # Ensure at least 1 thread
+      discovery_threads = thread_count.times.map do
+        Thread.new do
+          while !queue.empty?
+            url, depth = queue.pop(true) rescue nil
+            break unless url && depth
+            discover_links(url, depth, queue)
+          end
+        end
+      end
+
+      discovery_threads.each(&:join)
+      puts "Discovered #{@urls_to_scrape.size} URLs to scrape.".colorize(:green)
+    end
+
+    def discover_links(url, depth, queue)
+      return if depth < 0
+
+      puts "Discovering: #{url}".colorize(:light_blue) if @verbose
+
+      begin
+        doc = fetch_page(url)
+        find_links(doc, url).each do |link|
+          next if @restrict_domain && URI(link).host != @initial_domain
+          if @visited.add?(link)
+            @urls_to_scrape.add(link)
+            queue.push([link, depth - 1])
+          end
+        end
+      rescue StandardError => e
+        puts "Error discovering links from #{url}: #{e.message}".colorize(:red) if @verbose
+      ensure
+        sleep(@delay)
+      end
+    end
+
+    def find_links(doc, base_url)
+      base_uri = URI(base_url)
+      doc.css('a').map { |link| link['href'] }.compact.map do |href|
+        begin
+          uri = URI(href)
+          if uri.scheme.nil?
+            URI.join(base_uri, href).to_s
+          elsif ['http', 'https'].include?(uri.scheme.downcase)
+            uri.to_s
+          else
+            nil
+          end
+        rescue URI::InvalidURIError, NoMethodError
+          nil
+        end
+      end.compact
+    end
+
+    def process_urls
+      total_urls = @urls_to_scrape.size
+      processed = 0
+      start_time = Time.now
+
+      @urls_to_scrape.each do |url|
+        crawl_url(url)
+        processed += 1
+
+        elapsed_time = Time.now - start_time
+        avg_time_per_url = elapsed_time / processed
+        estimated_time_left = avg_time_per_url * (total_urls - processed)
+
+        puts "Processed: #{processed}/#{total_urls} | " \
+             "Elapsed: #{format_time(elapsed_time)} | " \
+             "Est. Left: #{format_time(estimated_time_left)}".colorize(:cyan)
+      end
+    end
+
+    def crawl_url(url)
+      puts "Crawling: #{url}".colorize(:light_blue)
+
+      begin
+        doc = fetch_page(url)
+        @scraper.scrape(doc.text, url)
+      rescue OpenURI::HTTPError => e
+        puts "HTTP Error crawling #{url}: #{e.message}".colorize(:red)
+      rescue SocketError, URI::InvalidURIError => e
+        puts "Error crawling #{url}: #{e.message}".colorize(:red)
+      rescue StandardError => e
+        puts "Unexpected error crawling #{url}: #{e.message}".colorize(:red)
+      ensure
+        sleep(@delay)
+      end
+    end
+
+    def fetch_page(url)
+      user_agent = @user_agent_switcher.random_user_agent
+      Nokogiri::HTML(URI.open(url, 'User-Agent' => user_agent))
+    end
+
+    def format_time(seconds)
+      minutes, seconds = seconds.divmod(60)
+      hours, minutes = minutes.divmod(60)
+      [hours, minutes, seconds].map { |t| t.to_i.to_s.rjust(2, '0') }.join(':')
+    end
+
+    def print_summary(start_time, end_time)
+      puts "\nCrawling complete!".colorize(:green)
+      puts "Total URLs processed: #{@urls_to_scrape.size}".colorize(:cyan)
+      puts "Total matches found: #{@scraper.matches.size}".colorize(:cyan)
+      puts "Total time: #{format_time(end_time - start_time)}".colorize(:cyan)
+
+      OutputFormatter.new(@format, @output, @save_source_url).format_results(@scraper.matches)
+      GatheringMode.print_summary if @gathering_mode
+    end
+  end
+end
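
`Crawler#find_links` leans on `URI.join` to resolve scheme-less hrefs against the current page; the behavior can be sanity-checked in isolation (illustrative URLs):

```ruby
require 'uri'

base = URI('https://example.com/docs/index.html')
puts URI.join(base, '/about').to_s     # => https://example.com/about
puts URI.join(base, 'guide.html').to_s # => https://example.com/docs/guide.html
```
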
data/lib/webmole/gathering_mode.rb
ADDED
@@ -0,0 +1,45 @@
+require 'fileutils'
+require 'set'
+
+module WebMole
+  class GatheringMode
+    @gathering_dir = File.join(Dir.home, '.webmole_gathered')
+    @gathered_data = Hash.new { |h, k| h[k] = Set.new }
+
+    class << self
+      def setup
+        FileUtils.mkdir_p(@gathering_dir) unless File.directory?(@gathering_dir)
+        load_existing_data
+      end
+
+      def save_to_gathering(match, type)
+        @gathered_data[type].add(match)
+        save_data(type)
+      end
+
+      def print_summary
+        puts "\nGathering mode summary:".colorize(:green)
+        @gathered_data.each do |type, matches|
+          puts "#{type}: #{matches.size} unique entries".colorize(:cyan)
+        end
+      end
+
+      private
+
+      def load_existing_data
+        Dir.glob(File.join(@gathering_dir, 'gathered_*.txt')).each do |file|
+          type = File.basename(file, '.txt').sub('gathered_', '')
+          File.readlines(file, chomp: true).each do |line|
+            @gathered_data[type].add(line)
+          end
+        end
+      end
+
+      def save_data(type)
+        File.open(File.join(@gathering_dir, "gathered_#{type}.txt"), 'w') do |f|
+          @gathered_data[type].each { |match| f.puts match }
+        end
+      end
+    end
+  end
+end
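
In gathering mode, deduplicated matches persist across runs as one flat file per type; per the naming scheme above, an `emails` run reads and rewrites `~/.webmole_gathered/gathered_emails.txt`. A quick way to inspect the accumulated set:

```ruby
# Path follows GatheringMode's gathered_<type>.txt convention
path = File.join(Dir.home, '.webmole_gathered', 'gathered_emails.txt')
puts File.readlines(path, chomp: true) if File.exist?(path)
```
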
data/lib/webmole/output_formatter.rb
ADDED
@@ -0,0 +1,84 @@
+require 'yaml'
+require 'csv'
+
+module WebMole
+  class OutputFormatter
+    def initialize(format = nil, output = nil, save_source_url = false)
+      @format = format
+      @output = output
+      @save_source_url = save_source_url
+    end
+
+    def format_results(matches)
+      if @output
+        save_results(matches)
+      else
+        print_results(matches)
+      end
+    end
+
+    private
+
+    def print_results(matches)
+      puts "Found matches:".colorize(:green)
+      matches.each do |match, urls|
+        if @save_source_url
+          urls.each { |url| puts "#{match} - #{url}".colorize(:cyan) }
+        else
+          puts match.colorize(:cyan)
+        end
+      end
+      puts "Total matches found: #{matches.size}".colorize(:yellow)
+    end
+
+    def save_results(matches)
+      case @format
+      when 'txt'
+        save_as_txt(matches)
+      when 'yaml'
+        save_as_yaml(matches)
+      when 'csv'
+        save_as_csv(matches)
+      else
+        puts "Unsupported format: #{@format}".colorize(:red)
+      end
+      puts "Total matches found: #{matches.size}".colorize(:yellow)
+    end
+
+    def save_as_txt(matches)
+      File.open(@output, 'w') do |file|
+        file.puts "Found matches:"
+        matches.each do |match, urls|
+          if @save_source_url
+            urls.each { |url| file.puts "#{match} - #{url}" }
+          else
+            file.puts match
+          end
+        end
+      end
+      puts "Results saved to #{@output}".colorize(:green)
+    end
+
+    def save_as_yaml(matches)
+      File.open(@output, 'w') do |file|
+        data = @save_source_url ? matches : matches.keys
+        file.write({ matches: data }.to_yaml)
+      end
+      puts "Results saved to #{@output}".colorize(:green)
+    end
+
+    def save_as_csv(matches)
+      CSV.open(@output, 'w') do |csv|
+        csv << (@save_source_url ? ['Match', 'Source URL'] : ['Match'])
+        matches.each do |match, urls|
+          if @save_source_url
+            urls.each { |url| csv << [match, url] }
+          else
+            csv << [match]
+          end
+        end
+      end
+      puts "Results saved to #{@output}".colorize(:green)
+    end
+  end
+end
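
Without `--save-source-url`, `save_as_yaml` serializes only the match strings; the resulting document can be previewed straight from IRB (values are illustrative):

```ruby
require 'yaml'

# Mirrors OutputFormatter#save_as_yaml when only matches are kept
puts({ matches: ['alice@example.com', 'bob@example.com'] }.to_yaml)
# ---
# :matches:
# - alice@example.com
# - bob@example.com
```
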
data/lib/webmole/scraper.rb
ADDED
@@ -0,0 +1,77 @@
+require_relative 'gathering_mode'
+
+module WebMole
+  class Scraper
+    attr_reader :matches
+
+    def initialize(scrape_option, pattern, gathering_mode = false)
+      @scrape_option = scrape_option
+      @pattern = pattern ? Regexp.new(pattern) : nil
+      @matches = Hash.new { |h, k| h[k] = Set.new }
+      @gathering_mode = gathering_mode
+    end
+
+    def scrape(text, url)
+      case @scrape_option
+      when :emails
+        find_emails(text, url)
+      when :phone_numbers
+        find_phone_numbers(text, url)
+      when :urls
+        find_urls(text, url)
+      when :social_media
+        find_social_media(text, url)
+      when :addresses
+        find_addresses(text, url)
+      when :credit_cards
+        find_credit_cards(text, url)
+      when :custom
+        find_matches(text, url)
+      end
+    end
+
+    private
+
+    def find_emails(text, url)
+      matches = text.scan(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/)
+      add_matches(matches, url, 'emails')
+    end
+
+    def find_phone_numbers(text, url)
+      matches = text.scan(/\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/)
+      add_matches(matches, url, 'phone_numbers')
+    end
+
+    def find_urls(text, url)
+      matches = text.scan(/https?:\/\/\S+/)
+      add_matches(matches, url, 'urls')
+    end
+
+    def find_social_media(text, url)
+      matches = text.scan(/@\w+/)
+      add_matches(matches, url, 'social_media')
+    end
+
+    def find_addresses(text, url)
+      # Non-capturing groups so String#scan returns the full address,
+      # not just the last captured fragments
+      matches = text.scan(/\d+\s+(?:[^\d\n]+\s)+(?:St|Ave|Rd|Blvd|Dr|Lane|Way)\.?/i)
+      add_matches(matches, url, 'addresses')
+    end
+
+    def find_credit_cards(text, url)
+      matches = text.scan(/\b(?:\d{4}[-\s]?){3}\d{4}\b/)
+      add_matches(matches, url, 'credit_cards')
+    end
+
+    def find_matches(text, url)
+      matches = text.scan(@pattern)
+      add_matches(matches, url, 'custom')
+    end
+
+    def add_matches(matches, url, type)
+      matches.each do |match|
+        @matches[match].add(url)
+        GatheringMode.save_to_gathering(match, type) if @gathering_mode
+      end
+    end
+  end
+end
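
A note on the regexes above: `String#scan` returns only the captures when a pattern contains capture groups, which is why the address pattern uses non-capturing `(?:...)`. A short illustration:

```ruby
text = '123 Main St and 9 Elm Ave'

p text.scan(/(\d+)\s+(\w+)/)
# => [["123", "Main"], ["9", "Elm"]]   (captures only)

p text.scan(/\d+\s+(?:\w+\s+)?(?:St|Ave)/)
# => ["123 Main St", "9 Elm Ave"]      (full matches)
```

The same caveat applies to `-s custom`: a `-p` pattern written with capture groups will record captures rather than whole matches.
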
data/lib/webmole/user_agent_switcher.rb
ADDED
@@ -0,0 +1,41 @@
+require 'yaml'
+require 'fileutils'
+
+module WebMole
+  class UserAgentSwitcher
+    CONFIG_DIR = File.join(Dir.home, '.config', 'webmole')
+    CONFIG_FILE = File.join(CONFIG_DIR, 'user_agents.yml')
+
+    def initialize
+      ensure_config_file_exists
+      @user_agents = YAML.load_file(CONFIG_FILE)
+    rescue StandardError => e
+      puts "Warning: Error loading user_agents.yml: #{e.message}. Using default user agent.".colorize(:yellow)
+      @user_agents = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36']
+    end
+
+    def random_user_agent
+      @user_agents.sample
+    end
+
+    private
+
+    def ensure_config_file_exists
+      return if File.exist?(CONFIG_FILE)
+
+      FileUtils.mkdir_p(CONFIG_DIR)
+      File.write(CONFIG_FILE, default_user_agents.to_yaml)
+      puts "Created default user_agents.yml in #{CONFIG_FILE}".colorize(:green)
+    end
+
+    def default_user_agents
+      [
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
+        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1'
+      ]
+    end
+  end
+end
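
The pool can be customized by editing the YAML file the class writes on first run; for example, regenerating it from Ruby with a single custom agent (the UA string is a placeholder):

```ruby
require 'yaml'
require 'fileutils'

# Same path as UserAgentSwitcher::CONFIG_FILE
config = File.join(Dir.home, '.config', 'webmole', 'user_agents.yml')
FileUtils.mkdir_p(File.dirname(config))
File.write(config, ['MyCrawler/1.0 (+https://example.com/bot)'].to_yaml)
```
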
data/lib/webmole.rb
ADDED
@@ -0,0 +1,51 @@
+require 'nokogiri'
+require 'open-uri'
+require 'uri'
+require 'set'
+require 'yaml'
+require 'csv'
+require 'colorize'
+
+require_relative 'webmole/version'
+require_relative 'webmole/crawler'
+require_relative 'webmole/scraper'
+require_relative 'webmole/output_formatter'
+require_relative 'webmole/user_agent_switcher'
+require_relative 'webmole/gathering_mode'
+
+module WebMole
+  class Error < StandardError; end
+
+  def self.run(options)
+    crawler = Crawler.new(options)
+    crawler.crawl
+  end
+
+  def self.print_banner
+    puts <<-'EOB'.colorize(:light_blue)
+__        __   _     __  __       _
+\ \      / /__| |__ |  \/  | ___ | | ___
+ \ \ /\ / / _ \ '_ \| |\/| |/ _ \| |/ _ \
+  \ V  V /  __/ |_) | |  | | (_) | |  __/
+   \_/\_/ \___|_.__/|_|  |_|\___/|_|\___|
+
+    EOB
+    puts "WebMole v#{VERSION}".colorize(:light_cyan)
+    puts "A relatively powerful web scraper.".colorize(:light_green)
+    puts
+  end
+
+  def self.print_disclaimer
+    puts "DISCLAIMER:".colorize(:yellow)
+    puts <<-EOD.colorize(:light_yellow)
+This tool is for educational and ethical use only. The user bears all responsibility
+for ensuring compliance with applicable laws, regulations, and website terms of service.
+Misuse of this tool may be illegal and/or unethical. Always obtain proper authorization
+before scraping any website.
+
+Contact: Subnetmasked <subnetmasked@cock.li>
+
+By using this tool, you agree to these terms and conditions.
+    EOD
+  end
+end
data/webmole.gemspec
ADDED
@@ -0,0 +1,30 @@
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'webmole/version'
+
+Gem::Specification.new do |spec|
+  spec.name = "webmole"
+  spec.version = WebMole::VERSION
+  spec.authors = ["Subnetmasked"]
+  spec.email = ["subnetmasked@cock.li"]
+  spec.summary = %q{A powerful web scraper}
+  spec.description = %q{WebMole is a Ruby-based web scraper with multiple features including email extraction, phone number scraping, and more.}
+  spec.homepage = "https://github.com/subnetmasked/webmole"
+  spec.license = "MIT"
+  spec.required_ruby_version = '>= 3.0'
+
+  spec.files = Dir['lib/**/*', 'bin/*', 'LICENSE.txt', '*.md', 'webmole.gemspec']
+  spec.bindir = "bin"
+  spec.executables = ["webmole"]
+  spec.require_paths = ["lib"]
+
+  spec.add_dependency "nokogiri", "~> 1.11"
+  spec.add_dependency "colorize", "~> 0.8"
+  spec.add_dependency "optparse", "~> 0.5.0"
+  spec.add_dependency "uri", "~> 0.13.1"
+  spec.add_dependency "csv", "~> 3.2"
+
+  spec.add_development_dependency "bundler", "~> 2.0"
+  spec.add_development_dependency "rake", "~> 13.0"
+  spec.add_development_dependency "rspec", "~> 3.10"
+end
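
With this gemspec, the package builds and installs locally in the standard way:

```
gem build webmole.gemspec
gem install ./webmole-1.0.0.gem
```
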
metadata
ADDED
@@ -0,0 +1,168 @@
+--- !ruby/object:Gem::Specification
+name: webmole
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+platform: ruby
+authors:
+- Subnetmasked
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2024-10-16 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.11'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.11'
+- !ruby/object:Gem::Dependency
+  name: colorize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.8'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.8'
+- !ruby/object:Gem::Dependency
+  name: optparse
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.0
+- !ruby/object:Gem::Dependency
+  name: uri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.13.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.13.1
+- !ruby/object:Gem::Dependency
+  name: csv
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.2'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.10'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.10'
+description: WebMole is a Ruby-based web scraper with multiple features including
+  email extraction, phone number scraping, and more.
+email:
+- subnetmasked@cock.li
+executables:
+- webmole
+extensions: []
+extra_rdoc_files: []
+files:
+- LICENSE.txt
+- README.md
+- bin/webmole
+- lib/webmole.rb
+- lib/webmole/crawler.rb
+- lib/webmole/gathering_mode.rb
+- lib/webmole/output_formatter.rb
+- lib/webmole/scraper.rb
+- lib/webmole/user_agent_switcher.rb
+- lib/webmole/version.rb
+- webmole.gemspec
+homepage: https://github.com/subnetmasked/webmole
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '3.0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.5.21
+signing_key:
+specification_version: 4
+summary: A powerful web scraper
+test_files: []