url-vi0lence 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +1 -0
- data/bin/url-vi0lence +2 -0
- data/lib/url-vi0lence.rb +24 -0
- data/lib/url_vi0lence/page_scraper.rb +152 -0
- data/scrapers/gmail_scraper.rb +32 -0
- data/scrapers/sg_scraper.rb +53 -0
- metadata +59 -0
data/README
ADDED
@@ -0,0 +1 @@
This is for ripping shit off.
data/bin/url-vi0lence
ADDED
data/lib/url-vi0lence.rb
ADDED
@@ -0,0 +1,24 @@
require "rubygems"
require "mechanize"
require "fileutils"
require "uri"

begin
  require "ruby-debug"
  Debugger.start
  Debugger.settings[:autoeval] = true if Debugger.respond_to?(:settings)
rescue LoadError
  puts "You need to install ruby-debug to run the server in debugging mode. With gems, use 'gem install ruby-debug'"
  exit
end

Dir.glob("./lib/url_vi0lence/**/*.rb").each do |file|
  require file
end

activate_samples = {
  "sg"    => false,
  "gmail" => false
}.each do |file, do_start|
  require "./scrapers/#{file}_scraper.rb" if do_start
end
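
The activate_samples hash at the bottom is the only switch for the bundled demos: each key names a file under ./scrapers/, and requiring one runs its demo at load time. A minimal sketch of the same hash with the gmail sample switched on (both ship disabled; this snippet is illustrative, not part of the gem):

# Same toggle hash as above, with the gmail demo enabled for illustration.
# Requiring ./scrapers/gmail_scraper.rb executes its demo immediately.
activate_samples = {
  "sg"    => false,
  "gmail" => true
}.each do |file, do_start|
  require "./scrapers/#{file}_scraper.rb" if do_start
end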
data/lib/url_vi0lence/page_scraper.rb
ADDED
@@ -0,0 +1,152 @@
class PageScraper

=begin
# initialize
#
# parameters
# === user_agent <String> User agent alias (Google it.) (Default: Mac Firefox)
# === redirect <Boolean> Should redirects be followed (Default: true)
# === use_cookies <Boolean> Should cookies be used (Default: true)
# === cookie_jar <String> Path to YAML cookie jar (Default: ./dump/cookies.yml)
=end
  def initialize(config={})
    config = {
      :user_agent  => "Mac FireFox",
      :redirect    => true,
      :use_cookies => true,
      :cookie_jar  => "#{PageScraper.dump}/cookies.yml"
    }.merge(config)

    @agent = WWW::Mechanize.new
    @agent.user_agent_alias = config[:user_agent]
    @agent.redirect_ok = config[:redirect]

    if config[:use_cookies]
      @agent.cookie_jar.save_as(config[:cookie_jar])
    end
  end

=begin
# login
#
# parameters
# = username <String> Username
# = password <String> Password
# = url <String> Login page URL
# = *args <Array>
# === un_field <String> Username HTML field name (Default: username)
# === pw_field <String> Password HTML field name (Default: password)
# === form_id <String|Fixnum> Form HTML name or the sequential number of the form in the page
# = block <Proc> Additional actions to do after login
# => Block should return Page or nil
=end
  def login(username, password, url, *args, &block)
    un_field = args[0] || "username"
    pw_field = args[1] || "password"
    form_id  = args[2] || 0 # First form

    current_page(@agent.get(url))

    if form_id.is_a? Fixnum
      form = @page.forms[form_id]
    else # form_id.is_a? String
      form = @page.forms.find { |f| f.name == form_id.to_s }
    end

    form.fields.find { |f| f.name == un_field }.value = username
    form.fields.find { |f| f.name == pw_field }.value = password

    current_page(@agent.submit(form))

    current_page(block.call({
      :username => username,
      :password => password,
      :un_field => un_field,
      :pw_field => pw_field,
      :form_id  => form_id,
      :agent    => @agent,
      :page     => @page
    })) if block_given?

    return current_page
  rescue Exception => e
    puts "There was an exception during #login:\n#{e}"
    Kernel.exit
  end

=begin
# current_page - Current page Mechanize is on
#
# parameters
# new_page <Mechanize::Page> Page to set current page to (Default nil)
=end
  def current_page(new_page=nil)
    @page = new_page unless new_page.nil?
    @page
  end

=begin
# parse - Executes a block against the site
#
# parameters
# = pages <Array> List of urls to execute the block against (Default = current_page)
# = block <Proc> Action that should be performed
# => Access will be available to agent and page
# => Block should return Page || nil
=end
  def parse(pages=nil, &block)
    pages = [current_page] if pages.nil?

    # Turn strings into Mechanize::Page; entries that are already pages pass through
    pages.collect! do |page|
      page.is_a?(String) ? @agent.get(page) : page
    end

    pages.collect { |page|
      current_page(block.call(page, @agent))
    }
  end

=begin
# filter_links - Filters links based on a regex match
#
# parameters
# = match_pattern <Regex> Pattern of links to keep
# TODO: I'm sure there is a cleaner implementation
=end
  def filter_links(match_pattern)
    links = []
    current_page.links.each do |link|
      if link.uri.to_s =~ match_pattern
        links.push(link)
      end
    end

    links.uniq!
    links
  rescue Exception => e
    # Nothing!
    []
  end

  class << self
    def dump
      @@dump = "./dump"
      FileUtils.mkdir_p(@@dump)
      @@dump
    end

    private
    def children
      @@children ||= []
    end

    def inherited(klass)
      super
    ensure
      children << klass
    end
  end
end
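
The RDoc blocks above spell out the intended call sequence: construct a PageScraper, log in against a form, then drive parse and filter_links. A minimal usage sketch, assuming a hypothetical site, form name, field names, and link pattern (none of these ship with the gem):

# Hypothetical usage of PageScraper; the URL, form name, field names and
# link pattern below are assumptions for illustration only.
scraper = PageScraper.new(:use_cookies => false)

# The *args slots map to un_field, pw_field, form_id (here the form named "login").
scraper.login("someuser", "secret", "http://example.com/login",
              "user", "pass", "login")

# Keep only links that look like article pages, then run a block over each.
article_urls = scraper.filter_links(%r{/articles/\d+}).map { |l| l.uri.to_s }
scraper.parse(article_urls) do |page, agent|
  puts page.title
  page # return the page so current_page tracks the last one visited
end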
data/scrapers/gmail_scraper.rb
ADDED
@@ -0,0 +1,32 @@
class GmailScraper < PageScraper
  NewInboxMessages = Proc.new { |page, agent|
    page.search("//tr[@bgcolor='#ffffff']") do |row|
      from, subject = *row.search("//b/text()")
      url = page.uri.to_s.sub(/ui.*$/, row.search("//a").first.attributes["href"])
      puts "From: #{from}\nSubject: #{subject}\nLink: #{url}\n\n"
    end
  }
end

gmail = GmailScraper.new

gmail_u = ""
gmail_p = ""

gmail.login gmail_u, gmail_p, "http://www.gmail.com", "Email", "Passwd" do |info|
  # This tells Gmail that we want to use the basic, no-js version
  info[:agent].get info[:page].uri.to_s.sub(/\?.*$/, "?ui=html&zy=n")
end

# You could pass it a block or a proc:
#
# gmail.parse do |page, agent|
#   page.search("//tr[@bgcolor='#ffffff']") do |row|
#     from, subject = *row.search("//b/text()")
#     url = page.uri.to_s.sub(/ui.*$/, row.search("//a").first.attributes["href"])
#     puts "From: #{from}\nSubject: #{subject}\nLink: #{url}\n\n"
#   end
# end

gmail.parse &GmailScraper::NewInboxMessages
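
As the comment above notes, parse takes either an inline block or a stored Proc such as NewInboxMessages. A small sketch of a second handler in the same shape, reusing the row selector from above (the handler itself is illustrative, not part of the gem):

# Hypothetical extra handler, passed to parse the same way as NewInboxMessages.
count_rows = Proc.new { |page, agent|
  rows = page.search("//tr[@bgcolor='#ffffff']")
  puts "#{rows.length} message rows on #{page.uri}"
  page # hand the page back so current_page stays current
}

gmail.parse &count_rows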
data/scrapers/sg_scraper.rb
ADDED
@@ -0,0 +1,53 @@
class SgScraper < PageScraper
  ImageGanker = lambda { |page, agent|
    begin
      hrefs = page.links.map { |m|
        m.href
      }.select { |u|
        u =~ /[0-9]+\.jpg/
      } # just jpgs with numbers in the name

      create_folder = true

      hrefs.each { |image|
        image_name = image.split("/")

        # Folder name = GirlName/PhotoSet
        folder = "#{SgScraper.dump}/sg/#{image_name[-4]}/#{image_name[-2]}"

        if create_folder
          FileUtils.mkdir_p(folder)
        end

        filename = "#{folder}/#{image_name[-1]}"
        puts "Saving #{image} as #{filename}"
        agent.get(image).save_as(filename)

        create_folder = false
      }
    rescue Exception => e
      puts e
      puts 'Failed to get a file boo hoo'
    end
  }

end


sg = SgScraper.new

sg_u = ""
sg_p = ""

sg.login sg_u, sg_p, "http://www.suicidegirls.com" do |info|
  # Go to a particular page after login
  info[:agent].get "http://suicidegirls.com/girls/Ren/photos/Mars+Attacks/"
end

urls = [
  "http://suicidegirls.com/girls/Ren/photos/Forest+Tea/",
  "http://suicidegirls.com/girls/Gatsby/photos/Trinkets/",
  "http://suicidegirls.com/members/Gatsby/albums/site/3578/"
]
sg.parse(urls, &SgScraper::ImageGanker)
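
Since SgScraper inherits filter_links from PageScraper, the hard-coded urls list above could also be derived from whatever page login landed on. A sketch, assuming the photo-set links on that page match a /photos/ pattern (an assumption, not taken from the site):

# Hypothetical alternative to the hard-coded urls array: pull photo-set
# links off the current page, then run ImageGanker over each of them.
photo_set_urls = sg.filter_links(%r{/photos/}).map { |link| link.uri.to_s }
sg.parse(photo_set_urls, &SgScraper::ImageGanker)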
metadata
ADDED
@@ -0,0 +1,59 @@
--- !ruby/object:Gem::Specification
name: url-vi0lence
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- Cory ODaniel
autorequire: url_vi0lence
bindir: bin
cert_chain: []

date: 2008-04-29 00:00:00 -07:00
default_executable:
dependencies: []

description:
email: urlviolence@coryodaniel.com
executables: []

extensions: []

extra_rdoc_files:
- README
files:
- bin/url-vi0lence
- lib/url-vi0lence.rb
- lib/url_vi0lence
- lib/url_vi0lence/page_scraper.rb
- scrapers/gmail_scraper.rb
- scrapers/sg_scraper.rb
- README
has_rdoc: true
homepage: url-vi0lence.rubyforge.com
post_install_message:
rdoc_options: []

require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project:
rubygems_version: 1.0.1
signing_key:
specification_version: 2
summary: Ripping off other peoples hard work, or automating testing whatev. Built on Mechanize
test_files: []