FreedomCoder-esearchy 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc ADDED
@@ -0,0 +1,81 @@
1
+ = Esearch
2
+
3
+ == DESCRIPTION
4
+ Esearchy is a small library capable of searching the internet for email addresses. Currently, the supported search methods are engines such as Google, Bing, Yahoo, PGP servers, GoogleGroups, etc , but I intend to add many more.
5
+
6
+ Also, the library searches inside .pdf, .docx, .xlsx, .pptx, .asn and .txt files for email addresses and adds them to the list of found accounts. Finally, we have support for .doc files but for now only in Windows Platforms.
7
+
8
+ NOTE: In order to work, Bing and Yahoo need an appid, for which you will have to create one for each and place them in files so the library will be able to work properly.
9
+ * data/yahoo.key
10
+ * data/bing.key
11
+
12
+ Soon, users should be able to also pass them as parameters.
13
+
14
+ == SUPPORT:
15
+
16
+ * http://github.com/FreedomCoder/esearchy/issues
17
+ * Emails from github.
18
+
19
+ == SYNOPSIS:
20
+
21
+ For now, there are two main ways of performing a search:
22
+
23
+ * Executable CLI command
24
+
25
+ esearchy --domain domain.com --maxhits 500 --yahoo_key dkajsdkajskdad --output "~/emails.txt"
26
+
27
+ * Library
28
+
29
+ For those who want to integrate this to their application you can use it in "the ruby way"
30
+
31
+ Esearchy.create "domain.com" do |domain|
32
+ domain.maxhits = 500
33
+ domain.search
34
+ domain.clean {|e| e =~ /<|>/ }
35
+ domain.save_to_file "~/emails.txt"
36
+ end
37
+
38
+ or in the more classic way in which you can create an Esearchy object and work on it
39
+
40
+ domain = Esearchy.new :query => "domain.com", :maxhits => 500
41
+ domain.search
42
+ domain.save_to_file "~/emails.txt"
43
+
44
+ == REQUIREMENTS:
45
+
46
+ * ruby 1.8 or 1.9
47
+ * cgi
48
+ * pdf/reader
49
+ * json
50
+
51
+ == INSTALL:
52
+
53
+ * sudo gem install freedomcoder-esearchy
54
+
55
+ == LICENSE:
56
+
57
+ (The MIT License)
58
+
59
+ Copyright (c) 2008 - 2009:
60
+
61
+ * {Matias P. Brutti}[http://www.freedomcoder.com.ar]
62
+
63
+
64
+ Permission is hereby granted, free of charge, to any person obtaining
65
+ a copy of this software and associated documentation files (the
66
+ 'Software'), to deal in the Software without restriction, including
67
+ without limitation the rights to use, copy, modify, merge, publish,
68
+ distribute, sublicense, and/or sell copies of the Software, and to
69
+ permit persons to whom the Software is furnished to do so, subject to
70
+ the following conditions:
71
+
72
+ The above copyright notice and this permission notice shall be
73
+ included in all copies or substantial portions of the Software.
74
+
75
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
76
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
77
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
78
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
79
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
80
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
81
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/esearchy ADDED
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'rubygems'
4
+ require 'getoptlong'
5
+ require 'esearchy'
6
+
7
+ @yahoo_key = nil
8
+ @bing_key = nil
9
+ @maxhits = nil
10
+ @domains = []
11
+ @output = nil
12
+
13
+ opts = GetoptLong.new(
14
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
15
+ ['--domain','-d', GetoptLong::REQUIRED_ARGUMENT ],
16
+ ['--file','-f', GetoptLong::REQUIRED_ARGUMENT ],
17
+ ['--filter','-p', GetoptLong::REQUIRED_ARGUMENT ],
18
+ ['--output','-o', GetoptLong::REQUIRED_ARGUMENT ],
19
+ ['--yahoo_key','-y', GetoptLong::REQUIRED_ARGUMENT ],
20
+ ['--bing_key','-b', GetoptLong::REQUIRED_ARGUMENT ],
21
+ ['--maxhits','-m', GetoptLong::REQUIRED_ARGUMENT ]
22
+ )
23
+
24
+ opts.each do |opt, arg|
25
+ case opt
26
+ when '--help':
27
+ # BEGIN OF HELP
28
+ puts "\nHELP for Esearchy\n---------------------\n
29
+ --help, -h
30
+ \tWell I guess you know what this is for (To obtain this Help).\n
31
+ --domain, -d [domain.com]
32
+ \t The domain name to search.\n
33
+ --filter, -p
34
+ \t The pattern to use to filter emails.\n
35
+ --file, -f [file_name]
36
+ \tIf we need to search more than one domain we can provide a list.\n
37
+ --output, -o
38
+ \tThe output file name.
39
+ Copyright 2009 - FreedomCoder\n"
40
+ #END OF HELP
41
+ exit(0)
42
+ when '--domain':
43
+ @domains << arg
44
+ when '--file':
45
+ if File.exists?(arg)
46
+ open(arg,'r').each_line do |line|
47
+ @domains << line
48
+ end
49
+ else
50
+ puts "File not found"
51
+ end
52
+ when '--yahoo_key':
53
+ @yahoo_key = arg
54
+ when '--bing_key':
55
+ @bing_key = arg
56
+ when '--filter':
57
+ @pattern = arg
58
+ when '--output':
59
+ @output = arg
60
+ when '--maxhits':
61
+ @maxhits = arg
62
+ else
63
+ puts "Unknown command. Please try again"
64
+ exit(0)
65
+ end
66
+ end
67
+
68
+ require 'esearchy'
69
+
70
+ @domains.each do |domain|
71
+ ESearchy.create domain do |d|
72
+ d.yahoo_key = @yahoo_key if @yahoo_key
73
+ d.bing_key = @bing_key if @bing_key
74
+ d.maxhits = @maxhits if @maxhits
75
+ d.search
76
+ d.save_to_file @output if @output
77
+ end
78
+ end
data/data/bing.key ADDED
@@ -0,0 +1 @@
1
+ skdajsdksaldksadjlaksdlaskd
data/data/yahoo.key ADDED
@@ -0,0 +1 @@
1
+ dsjdkajsdasjdkasjdlkasjdsakdjskldjsdkaj
data/esearchy.rb ADDED
@@ -0,0 +1 @@
1
+ require 'lib/esearchy.rb'
@@ -0,0 +1,76 @@
1
+ %w{rubygems json cgi net/http}.each { |lib| require lib }
2
+ local_path = "#{File.dirname(__FILE__)}/"
3
+ %w{searchy keys}.each {|lib| require local_path + lib}
4
+
5
+ class Bing
6
+ include Searchy
7
+
8
+ def initialize(maxhits = nil, appid=nil, start=nil)
9
+ @appid = appid || Keys::BING_APP_KEY
10
+ @start = start || 0
11
+ @emails = []
12
+ @threads = []
13
+ @totalhits = maxhits || 0
14
+ @r_urls = Queue.new
15
+ @r_docs = Queue.new
16
+ @r_pdfs = Queue.new
17
+ @r_officexs = Queue.new
18
+ @r_txts = Queue.new
19
+ @lock = Mutex.new
20
+ end
21
+ attr_accessor :emails, :appid
22
+
23
+ def search(query)
24
+ @query = query
25
+ begin
26
+ http = Net::HTTP.new("api.search.live.net",80)
27
+ http.start do |http|
28
+ request = Net::HTTP::Get.new("/json.aspx" + "?Appid="+ @appid +
29
+ "&query=" + CGI.escape(query) +
30
+ "&sources=web&web.count=50&start=#{@start}")
31
+ response = http.request(request)
32
+ case response
33
+ when Net::HTTPSuccess, Net::HTTPRedirection
34
+ parse(response.body)
35
+ @start = @start + 50
36
+ if @totalhits > @start
37
+ puts "Searching in URL: #{self.class} up to point #{@start}"
38
+ search_emails(response.body)
39
+ sleep(4)
40
+ search(query)
41
+ else
42
+ puts "Searching in URL: #{self.class} up to point #{@start}"
43
+ search_emails(response.body)
44
+ end
45
+ else
46
+ return response.error!
47
+ end
48
+ end
49
+ rescue Net::HTTPFatalError
50
+ puts "Error: Something went wrong with the HTTP request"
51
+ rescue Errno::ECONNREFUSED
52
+ puts "Error: < Connection Refused > Hopefuly they have not banned us. :)"
53
+ end
54
+
55
+ end
56
+
57
+ def parse(json)
58
+ doc = JSON.parse(json)
59
+ @totalhits = doc["SearchResponse"]["Web"]["Total"].to_i if @totalhits == 0
60
+ doc["SearchResponse"]["Web"]["Results"].each do |result|
61
+ case result["Url"]
62
+ when /.pdf$/i
63
+ @r_pdfs << result["Url"]
64
+ when /.docx$|.xlsx$|.pptx$/i
65
+ @r_officexs << result["Url"]
66
+ when /.doc$/i
67
+ @r_docs << result["Url"]
68
+ when /.txt$|.rtf$|ans$/i
69
+ @r_txts << result["Url"]
70
+ else
71
+ @r_urls << result["Url"]
72
+ end
73
+ end
74
+ end
75
+
76
+ end
@@ -0,0 +1,85 @@
1
+ %w{rubygems cgi net/http}.each { |lib| require lib }
2
+ local_path = "#{File.dirname(__FILE__)}/"
3
+ %w{searchy keys}.each {|lib| require local_path + lib}
4
+
5
+ class Google
6
+ include Searchy
7
+
8
+ def initialize(maxhits = nil, start = nil)
9
+ @start = start || 0
10
+ @totalhits = maxhits || 0
11
+ @emails = []
12
+ @r_urls = Queue.new
13
+ @r_docs = Queue.new
14
+ @r_pdfs = Queue.new
15
+ @r_txts = Queue.new
16
+ @r_officexs = Queue.new
17
+ @lock = Mutex.new
18
+ @threads = []
19
+ end
20
+
21
+ attr_accessor :emails
22
+
23
+ def search(query)
24
+ @query = query
25
+ http = Net::HTTP.new("www.google.com",80)
26
+ begin
27
+ http.start do |http|
28
+ request = Net::HTTP::Get.new( "/cse?&safe=off&num=100&site=" +
29
+ "&q=" + CGI.escape(query) +
30
+ "&btnG=Search&start=#{@start}")
31
+ response = http.request(request)
32
+ case response
33
+ when Net::HTTPSuccess, Net::HTTPRedirection
34
+ parse(response.body)
35
+ @start = @start + 100
36
+ if @totalhits > @start
37
+ puts "Searching in URL: #{self.class} up to point #{@start}"
38
+ search_emails(response.body)
39
+ sleep(4)
40
+ search(query)
41
+ else
42
+ puts "Searching in URL: #{self.class} up to point #{@start}"
43
+ search_emails(response.body)
44
+ end
45
+ else
46
+ return response.error!
47
+ end
48
+ end
49
+ rescue Net::HTTPFatalError
50
+ puts "Error: Something went wrong with the HTTP request"
51
+ end
52
+ end
53
+
54
+ def parse(html)
55
+ @totalhits= html.scan(/<\/b> of about <b>(.*)<\/b> for /)[0][0].gsub(",","").to_i if @totalhits == 0
56
+ html.scan(/<div class=g><span class="b w xsm">\[([A-Z]+)\]<\/span> <h2 class=r><a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)"|<h2 class=r><a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)"/).each do |result|
57
+ case result[0]
58
+ when /PDF/
59
+ @r_pdfs << result[1]
60
+ when /DOC|XLS|PPT/
61
+ case result[1]
62
+ when /.doc$/i
63
+ @r_docs << result[1]
64
+ when /.docx$|.xlsx$|.pptx$/i
65
+ @r_officexs << result[1]
66
+ end
67
+ when nil
68
+ case result[2]
69
+ when /.pdf$/i
70
+ @r_pdfs << result[2]
71
+ when /.doc$/i
72
+ @r_docs << result[2]
73
+ when /.docx$|.xlsx$|.pptx$/i
74
+ @r_officexs << result[2]
75
+ when /.txt$|.rtf$|ans$/i
76
+ @r_txts << result[2]
77
+ else
78
+ @r_urls << result[2]
79
+ end
80
+ else
81
+ puts "I do not parse the #{result[0]} filetype yet:)"
82
+ end
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,69 @@
1
+ %w{rubygems cgi net/http}.each { |lib| require lib }
2
+ local_path = "#{File.dirname(__FILE__)}/"
3
+ %w{searchy keys}.each {|lib| require local_path + lib}
4
+
5
+ class GoogleGroups
6
+ include Searchy
7
+
8
+ def initialize(maxhits = nil, start = nil)
9
+ @start = start || 0
10
+ @totalhits = maxhits || 0
11
+ @r_urls = Queue.new
12
+ @r_docs = Queue.new
13
+ @r_pdfs = Queue.new
14
+ @r_txts = Queue.new
15
+ @r_officexs = Queue.new
16
+ @lock = Mutex.new
17
+ @threads = []
18
+ end
19
+ attr_accessor :emails
20
+
21
+ def search(query)
22
+ @query = query
23
+ http = Net::HTTP.new("groups.google.com",80)
24
+ begin
25
+ http.start do |http|
26
+ request = Net::HTTP::Get.new( "/groups/search?&safe=off&num=100" +
27
+ "&q=" + CGI.escape(query) +
28
+ "&btnG=Search&start=#{@start}")
29
+ response = http.request(request)
30
+ case response
31
+ when Net::HTTPSuccess, Net::HTTPRedirection
32
+ parse(response.body)
33
+ @start = @start + 100
34
+ if @totalhits > @start
35
+ puts "Searching in URL: #{self.class} up to point #{@start}"
36
+ search_emails(response.body)
37
+ sleep(4)
38
+ search(query)
39
+ else
40
+ puts "Searching in URL: #{self.class} up to point #{@start}"
41
+ search_emails(response.body)
42
+ end
43
+ else
44
+ return response.error!
45
+ end
46
+ end
47
+ rescue Net::HTTPFatalError
48
+ puts "Error: Something went wrong with the HTTP request"
49
+ end
50
+ end
51
+
52
+ def parse(html)
53
+ @totalhits= html.scan(/<\/b> of about <b>(.*)<\/b> for /)[0][0].gsub(",","").to_i if @totalhits == 0
54
+ html.scan(/<div class=g align="left"><a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)" target=""/).each do |result|
55
+ case result[0]
56
+ when /.pdf$/i
57
+ @r_pdfs << result[0]
58
+ when /.doc$/i
59
+ @r_docs << result[0]
60
+ when /.docx$|.xlsx$|.pptx$/i
61
+ @r_officexs << result[0]
62
+ when /.txt$|.asn$/i
63
+ @r_txts << result[0]
64
+ else
65
+ @r_urls << result[0]
66
+ end
67
+ end
68
+ end
69
+ end
@@ -0,0 +1,5 @@
1
+ class Keys
2
+ BING_APP_KEY= open(File.dirname(__FILE__) + "/../../data/bing.key").readline.strip
3
+ YAHOO_APP_KEY= open(File.dirname(__FILE__) + "/../../data/yahoo.key").readline.strip
4
+ end
5
+
@@ -0,0 +1,76 @@
1
+ %w{rubygems cgi net/http}.each { |lib| require lib }
2
+ local_path = "#{File.dirname(__FILE__)}/"
3
+ %w{yahoo google}.each {|lib| require local_path + lib}
4
+
5
+ # http:///
6
+ class Linkedin
7
+ include Searchy
8
+
9
+ def initialize(maxhits=nil, start=nil)
10
+ @totalhits = maxhits || 0
11
+ @pages = 1
12
+ @emails = []
13
+ @lock = Mutex.new
14
+ @start = start || 0
15
+ @threads = []
16
+ @lock = Mutex.new
17
+ end
18
+ attr_accessor :emails, :appid
19
+
20
+ def search(query)
21
+ @query = query
22
+ begin
23
+ http = Net::HTTP.new("www.linkedin.com",80)
24
+ http.start do |http|
25
+ request = Net::HTTP::Get.new("search?search=&company=" + @query +
26
+ "&currentCompany=currentCompany" +
27
+ "&trk=coprofile_in_network_see_more" +
28
+ "&page_num=" + @pages)
29
+ response = http.request(request)
30
+ case response
31
+ when Net::HTTPSuccess, Net::HTTPRedirection
32
+ parse(response.body)
33
+ @start = @start + 10
34
+ if @totalhits > @start
35
+ @pages = @pages + 1
36
+ puts "Searching in: #{self.class} up to point #{@start}"
37
+ create_emails(response.body)
38
+ sleep(4)
39
+ search(@query)
40
+ else
41
+ puts "Searching in: #{self.class} up to point #{@start}"
42
+ search_emails(response.body)
43
+ end
44
+ else
45
+ return response.error!
46
+ end
47
+ end
48
+ rescue Net::HTTPFatalError
49
+ puts "Error: Something went wrong with the HTTP request"
50
+ end
51
+ end
52
+
53
+ def parse(string)
54
+ @totalhits = string.scan(/<p class="summary>"<strong>(\w)<\/strong>/) if @totalhits == 0
55
+ end
56
+
57
+ def search_people(string)
58
+ @people = string.scan(/<spam class="given-name">(*.)<\/spam><spam class="family-name">(*.)<\/spam>)/)
59
+ end
60
+ def search_person(name,last)
61
+ emails = Yahoo.new(50).search("first:\"#{name}\" last:\"#{last}\"").emails
62
+ emails.concat(Google.new(50).search("#{name} #{last}").emails).uniq!
63
+ end
64
+
65
+ def create_emails
66
+ @domain = + @query.match(/@/) ? @query : ("@" + @query)
67
+ @people.each do |person|
68
+ name = person[0]
69
+ last = person[1]
70
+ @emails << name + last + @domain
71
+ @emails << name[0] + last + @domain
72
+ @emails.concat(search_person(name,last))
73
+ end
74
+ print_emails(@emails)
75
+ end
76
+ end
@@ -0,0 +1,30 @@
1
+ %w{pdf/reader}.each { |lib| require lib }
2
+
3
+ class PageTextReceiver
4
+ attr_accessor :content
5
+
6
+ def initialize
7
+ @content = []
8
+ end
9
+
10
+ # Called when page parsing starts
11
+ def begin_page(arg = nil)
12
+ @content << ""
13
+ end
14
+
15
+ # record text that is drawn on the page
16
+ def show_text(string, *params)
17
+ @content.last << string.strip
18
+ end
19
+
20
+ # there's a few text callbacks, so make sure we process them all
21
+ alias :super_show_text :show_text
22
+ alias :move_to_next_line_and_show_text :show_text
23
+ alias :set_spacing_next_line_show_text :show_text
24
+
25
+ # this final text callback takes slightly different arguments
26
+ def show_text_with_positioning(*params)
27
+ params = params.first
28
+ params.each { |str| show_text(str) if str.kind_of?(String)}
29
+ end
30
+ end
@@ -0,0 +1,29 @@
1
+ class PGP
2
+ include Searchy
3
+
4
+ def initialize(maxhits=nil)
5
+ @emails = []
6
+ @lock = Mutex.new
7
+ @threads = []
8
+ end
9
+ attr_accessor :emails
10
+ def search(query)
11
+ @query = query
12
+ http = Net::HTTP.new("pgp.mit.edu",11371)
13
+ begin
14
+ http.start do |http|
15
+ request = Net::HTTP::Get.new( "/pks/lookup?search=#{@query}")
16
+ response = http.request(request)
17
+ case response
18
+ when Net::HTTPSuccess, Net::HTTPRedirection
19
+ puts "Searching #{self.class}"
20
+ search_emails(response.body)
21
+ else
22
+ return response.error!
23
+ end
24
+ end
25
+ rescue Net::HTTPFatalError
26
+ puts "Error: Something went wrong with the HTTP request"
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,248 @@
1
+ require 'digest/sha2'
2
+ require 'net/http'
3
+ require 'zip/zip'
4
+ require 'zip/zipfilesystem'
5
+ local_path = "#{File.dirname(__FILE__)}/"
6
+ require local_path + 'pdf2txt'
7
+ if RUBY_PLATFORM =~ /mingw|mswin/
8
+ require 'win32ole'
9
+ require local_path + 'wcol'
10
+ end
11
+
12
+
13
+
14
+ module Searchy
15
+ case RUBY_PLATFORM
16
+ when /mingw|mswin/
17
+ TEMP = "C:\\WINDOWS\\Temp\\"
18
+ else
19
+ TEMP = "/tmp/"
20
+ end
21
+
22
+ def search_emails(string)
23
+ string = string.gsub("<em>","") if self.class == Google #still not sure if this is going to work.
24
+ # OLD regex list = string.scan(/[a-z0-9!#$&'*+=?^_`{|}~-]+(?:\.[a-z0-9!#$&'*+=?\^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?/)
25
+ list = string.scan(/[a-z0-9!#$&'*+=?^_`{|}~-]+(?:\.[a-z0-9!#$&'*+=?\^_`{|}~-]+)*_at_(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\
26
+ [a-z0-9!#$&'*+=?^_`{|}~-]+(?:\.[a-z0-9!#$&'*+=?\^_`{|}~-]+)*\sat\s(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\
27
+ [a-z0-9!#$&'*+=?^_`{|}~-]+(?:\.[a-z0-9!#$&'*+=?\^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\
28
+ [a-z0-9!#$&'*+=?^_`{|}~-]+(?:\.[a-z0-9!#$&'*+=?\^_`{|}~-]+)*\s@\s(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\
29
+ [a-z0-9!#$&'*+=?^_`{|}~-]+(?:\sdot\s[a-z0-9!#$&'*+=?\^_`{|}~-]+)*\sat\s(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\sdot\s)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?/)
30
+ @lock.synchronize do
31
+ print_emails(list)
32
+ @emails.concat(fix(list)).uniq!
33
+ end
34
+ end
35
+
36
+ def search_pdfs(urls)
37
+ while urls.size >= 1
38
+ @threads << Thread.new do
39
+ web = URI.parse(urls.pop)
40
+ puts "Searching in PDF: #{web.to_s}\n"
41
+ begin
42
+ http = Net::HTTP.new(web.host,80)
43
+ http.start do |http|
44
+ request = Net::HTTP::Get.new("#{web.path}#{web.query}")
45
+ response = http.request(request)
46
+ case response
47
+ when Net::HTTPSuccess, Net::HTTPRedirection
48
+ name = Searchy::TEMP + "#{hash_url(web.to_s)}.pdf"
49
+ open(name, "wb") do |file|
50
+ file.write(response.body)
51
+ end
52
+ begin
53
+ receiver = PageTextReceiver.new
54
+ pdf = PDF::Reader.file(name, receiver)
55
+ search_emails(receiver.content.inspect)
56
+ rescue PDF::Reader::UnsupportedFeatureError
57
+ puts "Encrypted PDF: Unable to parse it.\n"
58
+ rescue PDF::Reader::MalformedPDFError
59
+ puts "Malformed PDF: Unable to parse it.\n"
60
+ end
61
+ `rm "#{name}"`
62
+ else
63
+ return response.error!
64
+ end
65
+ end
66
+ rescue Net::HTTPFatalError
67
+ puts "Error: Something went wrong with the HTTP request.\n"
68
+ rescue Net::HTTPServerException
69
+ puts "Error: Not longer there. 404 Not Found.\n"
70
+ rescue
71
+ puts "Error: < .. SocketError .. >\n"
72
+ end
73
+ end
74
+ end
75
+ @threads.each {|t| t.join } if @threads != nil
76
+ end
77
+
78
+ if RUBY_PLATFORM =~ /mingw|mswin/
79
+ def search_docs(urls)
80
+ while urls.size >= 1
81
+ @threads << Thread.new do
82
+ web = URI.parse(urls.pop)
83
+ puts "Searching in DOC: #{web.to_s}\n"
84
+ begin
85
+ http = Net::HTTP.new(web.host,80)
86
+ http.start do |http|
87
+ request = Net::HTTP::Get.new("#{web.path}#{web.query}")
88
+ response = http.request(request)
89
+ case response
90
+ when Net::HTTPSuccess, Net::HTTPRedirection
91
+ name = Searchy::TEMP + "#{hash_url(web.to_s)}.doc"
92
+ open(name, "wb") do |file|
93
+ file.write(response.body)
94
+ end
95
+ begin
96
+ word = WIN32OLE.new('word.application')
97
+ word.documents.open(name)
98
+ word.selection.wholestory
99
+ search_emails(word.selection.text.chomp)
100
+ word.activedocument.close( false )
101
+ word.quit
102
+ rescue
103
+ puts "Something went wrong parsing the .doc}\n"
104
+ end
105
+ `rm "#{name}"`
106
+ else
107
+ return response.error!
108
+ end
109
+ end
110
+ rescue Net::HTTPFatalError
111
+ puts "Error: Something went wrong with the HTTP request.\n"
112
+ rescue Net::HTTPServerException
113
+ puts "Error: Not longer there. 404 Not Found.\n"
114
+ rescue
115
+ puts "Error: < .. SocketError .. >\n"
116
+ end
117
+ end
118
+ end
119
+ @threads.each {|t| t.join } if @threads != nil
120
+ end
121
+ end
122
+
123
+ def search_office_xml(urls)
124
+ while urls.size >= 1
125
+ @threads << Thread.new do
126
+ web = URI.parse(urls.pop)
127
+ format = web.scan(/docx|xlsx|pptx/i)[0]
128
+ puts "Searching in #{format.upcase}: #{web.to_s}\n"
129
+ begin
130
+ http = Net::HTTP.new(web.host,80)
131
+ http.start do |http|
132
+ request = Net::HTTP::Get.new("#{web.path}#{web.query}")
133
+ response = http.request(request)
134
+ case response
135
+ when Net::HTTPSuccess, Net::HTTPRedirection
136
+ name = Searchy::TEMP + "#{hash_url(web.to_s)}." + format
137
+ open(name, "wb") do |file|
138
+ file.write(response.body)
139
+ end
140
+ begin
141
+ Zip::ZipFile.open(name) do |zip|
142
+ text = z.entries.each { |e| zip.file.read(e.name) if e.name =~ /.xml$/}
143
+ search_emails(text)
144
+ end
145
+ rescue
146
+ puts "Something went wrong parsing the .#{format.downcase}\n"
147
+ end
148
+ `rm "#{name}"`
149
+ else
150
+ return response.error!
151
+ end
152
+ end
153
+ rescue Net::HTTPFatalError
154
+ puts "Error: Something went wrong with the HTTP request.\n"
155
+ rescue Net::HTTPServerException
156
+ puts "Error: Not longer there. 404 Not Found.\n"
157
+ rescue
158
+ puts "Error: < .. SocketError .. >\n"
159
+ end
160
+ end
161
+ end
162
+ @threads.each {|t| t.join } if @threads != nil
163
+ end
164
+
165
+ def search_txts(urls)
166
+ while urls.size >= 1
167
+ @threads << Thread.new do
168
+ web = URI.parse(urls.pop)
169
+ puts "Searching in #{web.to_s.scan(/txt|rtf|ans/i)[0].upcase}: #{web.to_s}\n"
170
+ begin
171
+ http = Net::HTTP.new(web.host,80)
172
+ http.start do |http|
173
+ request = Net::HTTP::Get.new("#{web.path}#{web.query}")
174
+ response = http.request(request)
175
+ case response
176
+ when Net::HTTPSuccess, Net::HTTPRedirection
177
+ search_emails(response.body)
178
+ else
179
+ return response.error!
180
+ end
181
+ end
182
+ rescue Net::HTTPFatalError
183
+ puts "Error: Something went wrong with the HTTP request\n"
184
+ rescue Net::HTTPServerException
185
+ puts "Error: Not longer there. 404 Not Found.\n"
186
+ rescue
187
+ puts "Error: < .... >"
188
+ end
189
+ end
190
+ end
191
+ @threads.each {|t| t.join } if @threads != nil
192
+ end
193
+
194
+ # HELPER METHODS ---------------------------------------------------------------------------------
195
+
196
+ def print_emails(list)
197
+ list.each do |email|
198
+ unless @emails.include?(email)
199
+ unless RUBY_PLATFORM =~ /mingw|mswin/
200
+ if email.match(/#{@query.gsub("@","").split('.')[0]}/)
201
+ puts "\033[31m" + email + "\033\[0m"
202
+ else
203
+ puts "\033[32m" + email + "\033\[0m"
204
+ end
205
+ else
206
+ if email.match(/#{@query.gsub("@","").split('.')[0]}/)
207
+ Wcol::color(12)
208
+ puts email
209
+ Wcol::color(7)
210
+ else
211
+ Wcol::color(2)
212
+ puts email
213
+ Wcol::color(7)
214
+ end
215
+ end
216
+ end
217
+ end
218
+ end
219
+
220
+ def hash_url(url)
221
+ Digest::SHA2.hexdigest("#{Time.now.to_f}--#{url}")
222
+ end
223
+
224
+ def fix(list)
225
+ list.each do |e|
226
+ e.gsub!(" at ","@")
227
+ e.gsub!("_at_","@")
228
+ e.gsub!(" dot ",".")
229
+ end
230
+ end
231
+
232
+ def clean( &block )
233
+ @emails.delete_if &block.call
234
+ end
235
+
236
+ def maxhits=( value )
237
+ @totalhits = value
238
+ end
239
+
240
+ def search_depth
241
+ search_pdfs @r_pdfs if @r_pdfs
242
+ search_txts @r_txts if @r_txts
243
+ search_office_xml @r_officexs if @r_officexs
244
+ if RUBY_PLATFORM =~ /mingw|mswin/
245
+ search_docs @r_docs if @r_docs
246
+ end
247
+ end
248
+ end
@@ -0,0 +1,10 @@
1
+ require 'Win32API'
2
+ class Wcol
3
+ gsh = Win32API.new("kernel32", "GetStdHandle", ['L'], 'L')
4
+ @textAttr = Win32API.new("kernel32","SetConsoleTextAttribute", ['L','N'], 'I')
5
+ @h = gsh.call(-11)
6
+
7
+ def self.color(col)
8
+ @textAttr.call(@h,col)
9
+ end
10
+ end
@@ -0,0 +1,73 @@
1
+ %w{rubygems json cgi net/http}.each { |lib| require lib }
2
+ local_path = "#{File.dirname(__FILE__)}/"
3
+ %w{searchy keys}.each {|lib| require local_path + lib}
4
+
5
+ class Yahoo
6
+ include Searchy
7
+
8
+ def initialize(maxhits = nil, appid=nil, start=nil )
9
+ @appid = appid || Keys::YAHOO_APP_KEY
10
+ @start = start || 0
11
+ @totalhits = maxhits || 0
12
+ @emails = []
13
+ @r_urls = Queue.new
14
+ @r_docs = Queue.new
15
+ @r_pdfs = Queue.new
16
+ @r_officexs = Queue.new
17
+ @r_txts = Queue.new
18
+ @threads = []
19
+ @lock = Mutex.new
20
+ end
21
+ attr_accessor :emails, :appid
22
+
23
+ def search(query)
24
+ @query = query
25
+ begin
26
+ http = Net::HTTP.new("boss.yahooapis.com",80)
27
+ http.start do |http|
28
+ request = Net::HTTP::Get.new("/ysearch/web/v1/" + CGI.escape(query) +
29
+ "?appid="+ @appid +
30
+ "&format=json&count=50"+
31
+ "&start=#{@start}" )
32
+ response = http.request(request)
33
+ case response
34
+ when Net::HTTPSuccess, Net::HTTPRedirection
35
+ parse(response.body)
36
+ @start = @start + 50
37
+ if @totalhits > @start
38
+ puts "Searching in URL: #{self.class} up to point #{@start}"
39
+ search_emails(response.body)
40
+ sleep(4)
41
+ search(@query)
42
+ else
43
+ puts "Searching in URL: #{self.class} up to point #{@start}"
44
+ search_emails(response.body)
45
+ end
46
+ else
47
+ return response.error!
48
+ end
49
+ end
50
+ rescue Net::HTTPFatalError
51
+ puts "Error: Something went wrong with the HTTP request"
52
+ end
53
+ end
54
+
55
+ def parse(json)
56
+ doc = JSON.parse(json)
57
+ @totalhits = doc["ysearchresponse"]["totalhits"].to_i if @totalhits == 0
58
+ doc["ysearchresponse"]["resultset_web"].each do |result|
59
+ case result["url"]
60
+ when /.pdf$/i
61
+ @r_pdfs << result["url"]
62
+ when /.docx$|.xlsx$|.pptx$/i
63
+ @r_officexs << result["url"]
64
+ when /.doc$/i
65
+ @r_docs << result["url"]
66
+ when /.txt$|.rtf$|ans$/i
67
+ @r_txts << result["url"]
68
+ else
69
+ @r_urls << result["url"]
70
+ end
71
+ end
72
+ end
73
+ end
data/lib/esearchy.rb ADDED
@@ -0,0 +1,79 @@
1
+ local_path = "#{File.dirname(__FILE__) + '/esearchy/'}"
2
+ %w{google bing yahoo PGP keys}.each { |lib| require local_path + lib }
3
+
4
+ class ESearchy
5
+ def initialize(options={}, &block)
6
+ @query = options[:query]
7
+ @depth_search = options[:depth] || true
8
+ @maxhits = options[:maxhits]
9
+ @engines = options[:engines] || {"Google" => Google,
10
+ "Bing" => Bing,
11
+ "Yahoo" => Yahoo,
12
+ "PGP" => PGP }
13
+ @engines.each {|n,e| @engines[n] = e.new(@maxhits)}
14
+ @emails = Array.new
15
+ @threads = Array.new
16
+ block.call(self) if block_given?
17
+ end
18
+ attr_accessor :engines, :query, :threads, :depth_search
19
+ attr_reader :maxhits
20
+
21
+ def search(query=nil)
22
+ @engines.each do |n,e|
23
+ puts "+--- Launching Search for #{n} ---+\n"
24
+ e.search(query || @query)
25
+ e.search_depth if depth_search?
26
+ puts "+--- Finishing Search for #{n} ---+\n"
27
+ end
28
+ end
29
+
30
+ def emails
31
+ @engines.each do |n,e|
32
+ @emails.concat(e.emails).uniq!
33
+ end
34
+ @emails
35
+ end
36
+
37
+ def clean(&block)
38
+ emails.each do |e|
39
+ e.delete_if block.call
40
+ end
41
+ end
42
+
43
+ def maxhits=(value)
44
+ @engines.each do |n,e|
45
+ e.maxhits = value
46
+ end
47
+ end
48
+
49
+ def yahoo_key=(value)
50
+ @engines['Yahoo'].appid = value
51
+ end
52
+
53
+ def bing_key=(value)
54
+ @engines['Bing'].appid = value
55
+ end
56
+
57
+ def save_to_file(file)
58
+ open(file,"a") do |f|
59
+ emails.each { |e| f << e + "\n" }
60
+ end
61
+ end
62
+
63
+ def filter(regex)
64
+ emails.each.select { |email| email =~ regex }
65
+ end
66
+
67
+ def self.create(query=nil, &block)
68
+ self.new :query => query do |search|
69
+ block.call(search) if block_given?
70
+ end
71
+ end
72
+
73
+ private
74
+
75
+ def depth_search?
76
+ @depth_search
77
+ end
78
+
79
+ end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: FreedomCoder-esearchy
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.5
5
+ platform: ruby
6
+ authors:
7
+ - Matias P. Brutti
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-05-16 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: pdf/reader
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.7.5
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: json
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.1.6
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: rubyzip
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 0.9.1
44
+ version:
45
+ description:
46
+ email: matiasbrutti@gmail.com
47
+ executables:
48
+ - esearchy
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - README.rdoc
53
+ files:
54
+ - esearchy.rb
55
+ - bin
56
+ - bin/esearchy
57
+ - data
58
+ - data/bing.key
59
+ - data/yahoo.key
60
+ - lib
61
+ - lib/esearchy.rb
62
+ - lib/esearchy
63
+ - lib/esearchy/bing.rb
64
+ - lib/esearchy/google.rb
65
+ - lib/esearchy/googlegroups.rb
66
+ - lib/esearchy/keys.rb
67
+ - lib/esearchy/linkedin.rb
68
+ - lib/esearchy/pdf2txt.rb
69
+ - lib/esearchy/pgp.rb
70
+ - lib/esearchy/searchy.rb
71
+ - lib/esearchy/yahoo.rb
72
+ - lib/esearchy/wcol.rb
73
+ - README.rdoc
74
+ has_rdoc: true
75
+ homepage: http://freedomcoder.com.ar/esearchy
76
+ post_install_message:
77
+ rdoc_options: []
78
+
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: "0"
92
+ version:
93
+ requirements: []
94
+
95
+ rubyforge_project:
96
+ rubygems_version: 1.2.0
97
+ signing_key:
98
+ specification_version: 2
99
+ summary: A library to search for emails in search engines
100
+ test_files: []
101
+