url-vi0lence 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1 @@
1
+ This is for ripping shit off.
#! /usr/bin/env ruby
# Executable stub shipped with the url-vi0lence gem; emits a smoke-test line.
puts "wee"
require "rubygems"
require "mechanize"
require "fileutils"
require "uri"

# Debugging support is mandatory in this gem: missing ruby-debug aborts the
# process (original behavior preserved; callers rely on this hard exit).
begin
  require "ruby-debug"
  Debugger.start
  # Older ruby-debug releases lack Debugger.settings; guard the call.
  Debugger.settings[:autoeval] = true if Debugger.respond_to?(:settings)
rescue LoadError
  puts "You need to install ruby-debug to run the server in debugging mode. With gems, use 'gem install ruby-debug'"
  exit
end

# Load every library file under lib/url_vi0lence. NOTE(review): the glob is
# relative to the process working directory, not to this file -- consider
# anchoring with File.expand_path; preserved as-is.
Dir.glob("./lib/url_vi0lence/**/*.rb").each do |file|
  require file
end

# Sample scrapers are opt-in: flip a flag to true to load that scraper script.
# (The original bound this hash to an unused local `activate_samples`; removed.)
{
  "sg"    => false,
  "gmail" => false
}.each do |file, do_start|
  require "./scrapers/#{file}_scraper.rb" if do_start
end
class PageScraper

=begin
# initialize
#
# Builds and configures the underlying Mechanize agent.
#
# parameters
# === user_agent <String> User agent alias (Google it.) (Default: Mac Firefox)
# === redirect <Boolean> Should redirects be followed (Default: true)
# === use_cookies <Boolean> Should cookies be used (Default: true)
# === cookie_jar <String> Path to YAML cookie jar (Default: ./dump/cookies.yml)
=end
  def initialize(config={})
    config = {
      :user_agent => "Mac FireFox",
      :redirect => true,
      :use_cookies=> true,
      :cookie_jar => "#{PageScraper.dump}/cookies.yml"
    }.merge(config)

    # NOTE(review): WWW::Mechanize is the pre-1.0 namespace (plain Mechanize
    # in modern releases); kept for compatibility with the bundled dependency.
    @agent = WWW::Mechanize.new
    @agent.user_agent_alias = config[:user_agent]
    @agent.redirect_ok = config[:redirect]

    if config[:use_cookies]
      # NOTE(review): this SAVES the (fresh, empty) jar to disk rather than
      # loading an existing one -- confirm whether `load` was intended.
      @agent.cookie_jar.save_as(config[:cookie_jar])
    end
  end

=begin
# login
#
# Fetches +url+, fills in the selected form and submits it.
#
# parameters
# = username <String> Username
# = password <String> Password
# = url <String> Login page URL
# = *args <Array>
# === un_field <String> Username HTML field name (Default: username)
# === pw_field <String> Password HTML field name (Default: password)
# === form_id <String|Integer> Form HTML name or the sequential number of the form in the page
# = block <Proc> Additional actions to do after login
# => Block should return Page or nil
=end
  def login(username,password,url,*args,&block)
    un_field = args[0] || "username"
    pw_field = args[1] || "password"
    form_id  = args[2] || 0 # first form on the page

    current_page(@agent.get(url))

    # Integer (not the long-removed Fixnum) keeps this working on Ruby >= 2.4
    # while remaining true for Fixnum instances on older interpreters.
    if form_id.is_a? Integer
      form = @page.forms[form_id]
    else # form_id names the form
      form = @page.forms.find{|f| f.name == form_id.to_s}
    end

    form.fields.find {|f| f.name == un_field}.value = username
    form.fields.find {|f| f.name == pw_field}.value = password

    current_page(@agent.submit(form))

    # Give the caller a chance to post-process the login (e.g. follow a
    # redirect); whatever the block returns becomes the current page.
    current_page(block.call({
      :username => username,
      :password => password,
      :un_field => un_field,
      :pw_field => pw_field,
      :form_id => form_id,
      :agent => @agent,
      :page => @page
    })) if block_given?

    return current_page
  rescue StandardError => e
    # StandardError, not Exception: let SignalException/SystemExit propagate.
    puts "There was an exception during #login:\n#{e}"
    Kernel.exit
  end

=begin
# current_page - Current page Mechanize is on
#
# parameters
# new_page <Mechanize::Page> Page to set current page to (Default nil)
=end
  def current_page(new_page=nil)
    @page = new_page unless new_page.nil?
    @page
  end

=begin
# parse - Executes a block against the site
#
# parameters
# = pages <Array> List of urls to execute the block against (Default = current_page)
# = block <Proc> Action that should be performed
# => Access will be available to agent and page
# => Block should return Page || nil
=end
  def parse(pages=nil,&block)
    pages = [current_page] if pages.nil?

    # Turn URL strings into Mechanize::Page objects. (The original collect!
    # block returned nil for entries that were already pages, dropping them;
    # it also left a stray `debugger` breakpoint behind -- both fixed.)
    pages.collect! do |page|
      page.is_a?(String) ? @agent.get(page) : page
    end

    pages.collect {|page|
      current_page(block.call(page,@agent))
    }
  end

=begin
# filter_links - Filters links based on a regex match
#
# parameters
# = match_pattern <Regex> Pattern of links to keep
# => Returns the de-duplicated Array of matching links
=end
  def filter_links(match_pattern)
    # Fix: the original called uniq! on the never-assigned @links, raised
    # NoMethodError, swallowed it with a blanket rescue and returned nil.
    links = []
    current_page.links.each do |link|
      if link.uri.to_s =~ match_pattern
        links.push(link)
      end
    end
    links.uniq
  end

  class << self
    # Dump directory for downloads/cookies; created on every call.
    # NOTE(review): @@dump is a class variable shared across the hierarchy;
    # kept as-is since subclasses (e.g. SgScraper.dump) rely on it.
    def dump
      @@dump = "./dump"
      FileUtils.mkdir_p(@@dump)
      @@dump
    end

    private
    # Registry of subclasses, shared across the inheritance tree on purpose.
    def children
      @@children ||= []
    end

    def inherited(klass)
      super
    ensure
      children << klass
    end
  end
end
class GmailScraper < PageScraper
  # Prints sender, subject and a direct link for every white-background row
  # (message row) of the basic-HTML Gmail inbox page.
  NewInboxMessages = Proc.new {|page, agent|
    page.search("//tr[@bgcolor='#ffffff']") do |row|
      from, subject = *row.search("//b/text()")
      url = page.uri.to_s.sub(/ui.*$/, row.search("//a").first.attributes["href"])
      puts "From: #{from}\nSubject: #{subject}\nLink: #{url}\n\n"
    end
  }
end

gmail = GmailScraper.new

gmail_u = ""
gmail_p = ""

gmail.login(gmail_u, gmail_p, "http://www.gmail.com", "Email", "Passwd") do |info|
  # Switch Gmail to the basic, JavaScript-free interface before scraping.
  info[:agent].get info[:page].uri.to_s.sub(/\?.*$/, "?ui=html&zy=n")
end

# #parse accepts an inline block or a pre-built proc such as the constant below.
gmail.parse(&GmailScraper::NewInboxMessages)
class SgScraper < PageScraper
  # Downloads every numbered .jpg linked from a photo-set page into
  # ./dump/sg/<girl-name>/<photo-set>/, creating the folder once per page.
  ImageGanker = lambda { |page, agent|
    begin
      # Keep only links to numbered jpgs. Fix: the dot is now escaped --
      # the original /[0-9]+.jpg/ also matched names like "12xjpg".
      hrefs = page.links.map { |m|
        m.href
      }.select { |u|
        u =~ /[0-9]+\.jpg/
      }

      create_folder = true

      hrefs.each { |image|
        image_name = image.split("/")

        # Folder name = GirlName/PhotoSet (taken from the URL path segments)
        folder = "#{SgScraper.dump}/sg/#{image_name[-4]}/#{image_name[-2]}"

        if create_folder
          FileUtils.mkdir_p(folder)
        end

        filename = "#{folder}/#{image_name[-1]}"
        # Fix: the published diff garbled this interpolation ("#(unknown)");
        # reconstructed from the local computed on the previous line.
        puts "Saving #{image} as #{filename}"
        agent.get(image).save_as(filename)

        create_folder = false
      }
    rescue StandardError => e
      # StandardError, not Exception: do not swallow signals or SystemExit.
      puts e
      puts 'Failed to get a file boo hoo'
    end
  }

end


sg = SgScraper.new

sg_u = ""
sg_p = ""

sg.login sg_u,sg_p,"http://www.suicidegirls.com" do |info|
  # Go to a particular page after login
  info[:agent].get "http://suicidegirls.com/girls/Ren/photos/Mars+Attacks/"
end

urls=[
  "http://suicidegirls.com/girls/Ren/photos/Forest+Tea/",
  "http://suicidegirls.com/girls/Gatsby/photos/Trinkets/",
  "http://suicidegirls.com/members/Gatsby/albums/site/3578/"
]
sg.parse(urls, &SgScraper::ImageGanker)
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: url-vi0lence
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Cory ODaniel
8
+ autorequire: url_vi0lence
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-04-29 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: urlviolence@coryodaniel.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - README
24
+ files:
25
+ - bin/url-vi0lence
26
+ - lib/url-vi0lence.rb
27
+ - lib/url_vi0lence
28
+ - lib/url_vi0lence/page_scraper.rb
29
+ - scrapers/gmail_scraper.rb
30
+ - scrapers/sg_scraper.rb
31
+ - README
32
+ has_rdoc: true
33
+ homepage: url-vi0lence.rubyforge.com
34
+ post_install_message:
35
+ rdoc_options: []
36
+
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ required_rubygems_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: "0"
50
+ version:
51
+ requirements: []
52
+
53
+ rubyforge_project:
54
+ rubygems_version: 1.0.1
55
+ signing_key:
56
+ specification_version: 2
57
+ summary: Ripping off other peoples hard work, or automating testing whatev. Built on Mechanize
58
+ test_files: []
59
+