esearchy 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +112 -0
- data/bin/esearchy +334 -0
- data/lib/esearchy/LocalEngines/directory.rb +16 -0
- data/lib/esearchy/OtherEngines/googlegroups.rb +27 -0
- data/lib/esearchy/OtherEngines/ldap.rb +44 -0
- data/lib/esearchy/OtherEngines/pgp.rb +22 -0
- data/lib/esearchy/OtherEngines/spider.rb +43 -0
- data/lib/esearchy/OtherEngines/usenet.rb +22 -0
- data/lib/esearchy/SearchEngines/altavista.rb +25 -0
- data/lib/esearchy/SearchEngines/bing.rb +32 -0
- data/lib/esearchy/SearchEngines/google.rb +30 -0
- data/lib/esearchy/SearchEngines/yahoo.rb +32 -0
- data/lib/esearchy/SocialEngines/classmates.rb +33 -0
- data/lib/esearchy/SocialEngines/googleprofiles.rb +36 -0
- data/lib/esearchy/SocialEngines/linkedin.rb +35 -0
- data/lib/esearchy/SocialEngines/linkedinfull.rb +100 -0
- data/lib/esearchy/SocialEngines/naymz.rb +36 -0
- data/lib/esearchy/bugmenot.rb +26 -0
- data/lib/esearchy/docs.rb +267 -0
- data/lib/esearchy/esearchy.rb +195 -0
- data/lib/esearchy/genericengine.rb +153 -0
- data/lib/esearchy/localengines.rb +1 -0
- data/lib/esearchy/otherengines.rb +5 -0
- data/lib/esearchy/searchengines.rb +4 -0
- data/lib/esearchy/socialengines.rb +4 -0
- data/lib/esearchy/useragent.rb +188 -0
- data/lib/esearchy.rb +24 -0
- metadata +129 -0
@@ -0,0 +1,30 @@
|
|
1
|
+
module ESearchy
  module SearchEngines
    # Scrapes Google web search results (via the /cse endpoint).
    class Google < ESearchy::GenericEngine
      ENGINE = "www.google.com"
      PORT = 80
      NUM = 100  # results requested per page
      TYPE = 1

      # Builds the query path; the generic engine appends the start offset
      # and performs the paging requests.
      # NOTE(review): @query is interpolated unescaped -- presumably already
      # URL-encoded by the caller; confirm.
      def search
        @querypath = "/cse?&safe=off&num=100&site=&q=" + @query + "&btnG=Search&start="
        super
      end

      # Extracts the total hit count and result URLs from a raw Google
      # results page (+html+) and forwards the URL matches to the generic
      # engine via super.
      def parse( html )
        hits = html.scan(/<\/b> of [\w\s]*<b>(.*)<\/b> for /)
        # String#scan never returns nil, so only the empty check matters
        # (the original's trailing `hits == nil` test was dead code).
        if hits.empty?
          @totalhits = 0
        else
          @totalhits = totalhits(hits[0][0].gsub(",","").to_i)
        end
        super html.scan(/<div class=g><span class="b w xsm">\[([A-Z]+)\]<\/span> \
<h2 class=r><a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)"|<h2 class=r><a href="\
([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)"/)
      end
    end
  end
end
@@ -0,0 +1,32 @@
|
|
1
|
+
module ESearchy
  module SearchEngines
    # Queries the Yahoo! BOSS JSON API. An application id must be set
    # through #appid= before calling #search.
    class Yahoo < ESearchy::GenericEngine
      ENGINE = "boss.yahooapis.com"
      PORT = 80
      NUM = 50   # results requested per page
      TYPE = 1

      def search
        # Validate first: the original `@querypath = ... or raise` could
        # never fire (a string assignment is always truthy), and a nil
        # @appid raised TypeError on concatenation before reaching it.
        raise ESearchyMissingAppID, "Missing AppID <Class.appid=>" if @appid.nil?
        @querypath = "/ysearch/web/v1/" + @query +
                     "?appid=" + @appid + "&format=json&count=50"
        super
      end

      # BOSS application id (required by the API).
      def appid=(value)
        @appid = value
      end

      # Parses the JSON response, records the total hit count, and hands
      # the web result set to the generic engine.
      def parse(json)
        doc = JSON.parse(json)
        # to_i always yields an Integer, so a nil check is unnecessary.
        hits = doc["ysearchresponse"]["totalhits"].to_i
        if hits.zero?
          @totalhits = 0
        else
          @totalhits = totalhits(hits)
        end
        super doc["ysearchresponse"]["resultset_web"]
      end
    end
  end
end
@@ -0,0 +1,33 @@
|
|
1
|
+
module ESearchy
  module SocialEngines
    # Mines classmates.com profiles through a Google site: search.
    class Classmates < ESearchy::GenericEngine
      ENGINE = "www.google.com"
      PORT = 80
      NUM = 100
      TYPE = 2

      def search
        # Validate up front: the original trailing `or raise` was dead code
        # (string assignment is truthy) and CGI.escape(nil) would raise
        # TypeError before reaching it anyway. Also fixes the "Mssing" typo.
        raise ESearchyMissingCompany, "Missing website url Object.company=(value)" if @company.nil?
        @querypath = "/cse?q=site%3Awww.classmates.com+%22work+at+" + CGI.escape(@company) +
                     "%22&hl=en&cof=&num=100&filter=0&safe=off&start="
        super
      end

      # Records the total hit count from the Google results page.
      def parse( html )
        hits = html.scan(/<\/b> of[ about | ]<b>(.*)<\/b> from/)
        # scan never returns nil; the empty check is sufficient.
        if hits.empty?
          @totalhits = 0
        else
          @totalhits = totalhits(hits[0][0].gsub(",","").to_i)
        end
      end

      # Extracts "First Last" names from result links and records each as
      # a person hit.
      def crawl_people(html)
        html.scan(/<a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)" class=l[\sonmousedown="return clk(this.href,'','','res','\d','')"]*>([\w\s]*) \|/).each do |profile|
          name, last = profile[1].split(" ")
          @people << [name, last]
          @results << [[name, last], "P", self.class.to_s.upcase, "N"]
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
|
|
1
|
+
module ESearchy
  module SocialEngines
    # Mines Google Profiles pages through a Google site: search.
    class GoogleProfiles < ESearchy::GenericEngine
      ENGINE = "www.google.com"
      PORT = 80
      NUM = 100
      TYPE = 2

      def search
        # Validate up front: the original trailing `or raise` was dead code
        # (string assignment is truthy) and CGI.escape(nil) would raise
        # TypeError before reaching it anyway. Also fixes the "Mssing" typo.
        raise ESearchyMissingCompany, "Missing website url Object.company=(value)" if @company.nil?
        @querypath = "/cse?q=site:www.google.com+intitle:%22Google+" +
                     "Profile%22+%22Companies+I%27ve+worked+for%22+%22at+" +
                     CGI.escape(@company) + "%22&hl=en&cof=&num=100&filter=0&safe=off&start="
        super
      end

      # Records the total hit count. Sample results line:
      #   Results <b>1</b> - <b>8</b> of <b>8</b> from <b>www.google.com</b>
      def parse( html )
        hits = html.scan(/<\/b> of <b>(.*)<\/b> from /)
        # scan never returns nil; the empty check is sufficient.
        if hits.empty?
          @totalhits = 0
        else
          # Only record the total once per crawl (guarded by @was_here).
          @totalhits = totalhits(hits[0][0].gsub(",","").to_i) unless @was_here
        end
      end

      # Extracts "First Last" names from result links and records each as
      # a person hit.
      def crawl_people(text)
        text.scan(/<a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)" class=l[\sonmousedown="return clk(this.href,'','','res','\d','')"]*>([\w\s]*) -/).each do |profile|
          name, last = profile[1].split(" ")
          @people << [name, last]
          @results << [[name, last], "P", self.class.to_s.upcase, "N"]
        end
      end
    end
  end
end
@@ -0,0 +1,35 @@
|
|
1
|
+
module ESearchy
  module SocialEngines
    # Mines linkedin.com/in profiles through a Google site: search
    # (unauthenticated variant; see linkedinfull.rb for direct search).
    class LinkedIn < ESearchy::GenericEngine
      ENGINE = "www.google.com"
      PORT = 80
      NUM = 100
      TYPE = 2

      def search
        # Validate up front: the original trailing `or raise` was dead code
        # (string assignment is truthy) and CGI.escape(nil) would raise
        # TypeError before reaching it anyway. Also fixes the "Mssing" typo.
        raise ESearchyMissingCompany, "Missing website url Object.company=(value)" if @company.nil?
        @querypath = "/cse?q=site%3Awww.linkedin.com/in+%22at+" +
                     CGI.escape(@company) + "%22&hl=en&cof=&num=100&filter=0&safe=off&start="
        super
      end

      # Records the total hit count. Sample results line:
      #   Results <b>1</b> - <b>8</b> of <b>8</b> from <b>www.google.com</b>
      def parse( html )
        hits = html.scan(/<\/b> of [\w\s]*<b>(.*)<\/b> from /)
        # scan never returns nil; the empty check is sufficient.
        if hits.empty?
          @totalhits = 0
        else
          @totalhits = totalhits(hits[0][0].gsub(",","").to_i)
        end
      end

      # Extracts "First Last" names from result links and records each as
      # a person hit.
      def crawl_people(text)
        text.scan(/<a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)" class=l[\sonmousedown="return clk(this.href,'','','res','\d','')"]*>([\w\s]*) -/).each do |profile|
          name, last = profile[1].split(" ")
          @people << [name, last]
          @results << [[name, last], "P", self.class.to_s.upcase, "N"]
        end
      end
    end
  end
end
@@ -0,0 +1,100 @@
|
|
1
|
+
require 'net/https'
module ESearchy
  module SocialEngines
    # Searches LinkedIn directly (authenticated) for people at @company.
    class LinkedIn < ESearchy::GenericEngine
      ENGINE = "www.linkedin.com"
      PORT = 80
      NUM = 1
      TYPE = 2

      def search
        # Validate first: the original trailing `or raise` was dead code and
        # CGI.escape(nil) would raise TypeError before it anyway.
        raise ESearchyMissingCompany, "Missing website url Object.company=(value)" if @company.nil?
        # "&currentCompany" had been mangled to "¤tCompany" by an
        # HTML-entity round trip (&curren -> ¤); restored here.
        @querypath = "/search?search=&currentCompany=co&company=" + CGI.escape(@company) +
                     "&proposalType=Y&newnessType=Y&pplSearchOrigin=MDYS&searchLocationType=Y&page_num="
        super
      end

      # Records the total hit count from the results summary, e.g.
      #   <p class="summary"> <strong>1,234</strong> results
      # (the original only dumped the page with `p`; this implements the
      # intent shown in its commented-out code -- TODO confirm against a
      # live page).
      def parse( html )
        hits = html.scan(/<p class="summary">[\n\s]+<strong>(.*)<\/strong> results/)
        unless @was_here or hits.empty?
          @totalhits = totalhits(hits[0][0].gsub(/[,.]/, "").to_i)
        end
      end

      # Stores credentials, performs the login handshake and installs the
      # resulting cookie as the engine's request HEADER.
      def credentials=(c)
        @user = c[0].to_s
        @pwd = c[1].to_s
        LinkedIn.const_set :HEADER, login
        self.start = (1)
      end

      # LinkedIn paginates 10 results per page.
      def maxhits=(v)
        super v / 10
      end

      private
      # Extracts given/family name pairs from profile links.
      def crawl_people(html)
        list = html.scan(/title="View profile">[\n\s]+<span class="given-name">(.*)<\/span>\
[\n\s]+<span class="family-name">(.*)<\/span>/)
        @people.concat(list).uniq!
        list.each { |p| @results << [p, "P", self.class.to_s.upcase, "N"] }
      end

      # Performs the two-step login (fetch csrf token, POST credentials)
      # and returns the headers to use for authenticated requests.
      def login
        begin
          get ENGINE, PORT, "/secure/login?trk=hb_signin", {'User-Agent' => "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.1.5) Gecko/20091102"} do |r|
            @l_headers = r.to_hash
            @l_headers.each { |k, v| @l_headers[k] = v.to_s }
            @csrfToken = r.body.scan(/<input type="hidden" name="csrfToken" value="ajax:(.*)">/)[0][0]
          end
          http = Net::HTTP.new(ENGINE, 443)
          http.use_ssl = true
          # NOTE(review): VERIFY_NONE disables certificate checking; kept
          # as-is but consider VERIFY_PEER.
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
          http.start do |http|
            # Escape each value individually. The original CGI.escaped the
            # whole body (mangling the '&'/'=' separators) and was missing
            # the '&' before session_key.
            body = "csrfToken=ajax:#{CGI.escape(@csrfToken)}" +
                   "&session_key=#{CGI.escape(@user)}" +
                   "&session_password=#{CGI.escape(@pwd)}" +
                   "&session_login=Sign+In&session_login=&session_rikey="

            @l_headers['Host'] = "www.linkedin.com"
            @l_headers['User-Agent'] = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5"
            @l_headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
            @l_headers['Accept-Language'] = "en-us,en;q=0.5"
            @l_headers['Accept-Charset'] = "ISO-8859-1,utf-8;q=0.7,*;q=0.7"
            @l_headers['Keep-Alive'] = "300"
            @l_headers['Connection'] = "keep-alive"
            @l_headers['Referer'] = "https://www.linkedin.com/secure/login?trk=hb_signin"
            # NOTE(review): hardcoded stale session cookie carried over from
            # the original capture -- likely unnecessary; verify.
            @l_headers['Cookie'] = "JSESSIONID=\"ajax:5367441617418183976\"; visit=G; bcookie=\"v=1&8231965c-b4b7-48f2-8349-76514ba89b69\"; lang=\"v=2&lang=en&c=\"; NSC_MC_QH_MFP=e242089229a3; __utma=226841088.2037160969.1259078198.1259078198.1259078198.1; __utmb=226841088.2.10.1259078198; __utmc=226841088; __utmz=226841088.1259078198.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=226841088.user; leo_auth_token=\"GST:9_t6crYtB4AWStfoqhWQ6LYPKakWfHk_dotQyAHagiRX1HlEvqVt5-:1259081816:56d4aecb2e985d7f8a30d74e758f261ea8b92065\"; NSC_MC_WT_YUSL_IUUQ=e2420f8429a0"
            @l_headers['Content-Type'] = "application/x-www-form-urlencoded"
            @l_headers['Content-Length'] = body.size.to_s

            request = Net::HTTP::Post.new("/secure/login", @l_headers)
            request.body = body
            response = http.request(request)
            case response
            when Net::HTTPSuccess, Net::HTTPRedirection
              return {'Cookie' => response['Set-Cookie'], 'User-Agent' => UserAgent::fetch}
            else
              return response.error!
            end
          end
        rescue Net::HTTPFatalError
          # `${$@}` in the original was not interpolation; `#{$@}` is.
          D "Error: Something went wrong while login to LinkedIn.\n\t#{$@}"
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
|
|
1
|
+
module ESearchy
  module SocialEngines
    # Mines naymz.com profiles through a Google site: search.
    class Naymz < ESearchy::GenericEngine
      ENGINE = "www.google.com"
      PORT = 80
      NUM = 100
      TYPE = 2

      def search
        # NOTE(review): a nil @company raises TypeError inside CGI.escape;
        # validate explicitly for a clearer error.
        raise ESearchyMissingCompany, "Missing website url Object.company=(value)" if @company.nil?
        @querypath = "/cse?q=site:naymz.com%20%2B%20%22@%20" + CGI.escape(@company) +
                     "%22&hl=en&cof=&num=100&filter=0&safe=off&start="
        super
      end

      # Records the total hit count. Sample results line:
      #   </b> of about <b>760</b> from <b>
      def parse( html )
        hits = html.scan(/<\/b> of about <b>(.*)<\/b> from/)
        # scan never returns nil; the empty check is sufficient.
        if hits.empty?
          @totalhits = 0
        else
          @totalhits = totalhits hits[0][0].gsub(",","").to_i unless @was_here
        end
      end

      # Extracts names from result links, dropping honorific tokens.
      def crawl_people(html)
        html.scan(/<a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)" class=l[\sonmousedown="return clk(this.href,'','','res','\d','')"]*>([\w\s]*) -/).each do |profile|
          # Anchor the honorific match: the original unanchored
          # /mr.|mr|ms.|ms|phd.|dr.|dr|phd/ deleted any token merely
          # *containing* those letters (e.g. "Andrew" matched "dr").
          person = profile[1].split(" ").delete_if do |x|
            x =~ /\A(?:mr|ms|dr|phd)\.?\z/i
          end
          @people << person
          @results << [person, "P", self.class.to_s.upcase, "N"]
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
|
|
1
|
+
# NOTE: the former `require 'base64'` was dropped -- this file only uses
# String#unpack("m*"), which needs no library (and base64 is no longer a
# default gem as of Ruby 3.4).
module ESearchy
  # Fetches shared (throwaway) credentials for a domain from bugmenot.com.
  class Bugmenot
    # Returns [user, pass] for +domain+, or [nil, nil] on any failure
    # (network error, page layout change, no accounts listed).
    def self.fetch(domain = "www.linkedin.com")
      begin
        url = Net::HTTP.get URI.parse("http://www.bugmenot.com/view/#{domain}")
        key = ( url.scan(/var key =(.*);/)[0][0].to_i + 112 ) / 12
        user, pass = url.scan(/tr><th>Username <\/th><td><script>d\('(.*)'\);<\/script><\/td><\/tr>
[\n\s]+<tr><th>Password <\/th><td><script>d\('(.*)'\);<\/script><\/td><\/tr>/)[0]
        user = decode(user, key)
        pass = decode(pass, key)
        return [user, pass]
      rescue
        return [nil, nil]
      end
    end

    # Base64-decodes +input+, drops the 4-byte prefix, and shifts every
    # remaining byte down by +offset+. (thanks tlrobinson @ github)
    # Defined as a class method: the original was a private *instance*
    # method, so self.fetch raised NameError whenever a page matched.
    def self.decode(input, offset)
      input.unpack("m*")[0][4..-1].unpack("C*").map { |c| c - offset }.pack("C*")
    end
    private_class_method :decode
  end
end
@@ -0,0 +1,267 @@
|
|
1
|
+
module ESearchy
  # Callback receiver for PDF::Reader: accumulates the text drawn on each
  # page into @content (one string per page).
  class PageTextReceiver
    attr_accessor :content

    def initialize
      @content = []
    end

    # Called when page parsing starts.
    def begin_page(arg = nil)
      @content << ""
    end

    # Record text that is drawn on the page.
    def show_text(string, *params)
      @content.last << string.strip
    end

    # There are a few text callbacks, so make sure we process them all.
    alias :super_show_text :show_text
    alias :move_to_next_line_and_show_text :show_text
    alias :set_spacing_next_line_show_text :show_text

    # This final text callback takes slightly different arguments.
    def show_text_with_positioning(*params)
      params = params.first
      params.each { |str| show_text(str) if str.kind_of?(String) }
    end
  end

  # Downloads documents queued in @@documents, extracts their text, and
  # harvests email addresses into @emails / @results.
  class Docs
    case RUBY_PLATFORM
    when /mingw|mswin/
      TEMP = "C:\\WINDOWS\\Temp\\"
    else
      TEMP = "/tmp/"
    end
    # NOTE(review): :documents exposes the *instance* variable, which is
    # never assigned (the queue lives in @@documents) -- it always returns
    # nil; kept for interface compatibility.
    attr_reader :documents, :emails, :results

    # +doc+ may be an Array of documents (merged into a fresh queue), an
    # existing queue, or nil. +size+ is the per-download byte limit.
    def initialize(doc = nil, size = 10485760)
      case doc
      when Array
        @@documents = Queue.new
        self.merge doc
      else
        @@documents = doc || Queue.new
      end
      @size = size
      @emails = []
      @results = []
      @lock = Mutex.new
    end

    ## Class methods
    # Convenience constructor: build an instance and run its search.
    # (The original called the *class* method `search(doc)` again --
    # infinite recursion.)
    def self.search(doc)
      self.new(doc).search
    end

    def merge(array)
      array.each { |a| push(a) }
    end

    # (The original body called `push(doc)` -- itself -- recursing forever.)
    def self.push(doc)
      @@documents.push(doc)
    end

    def push(doc)
      @@documents.push(doc)
    end

    # Processes already-downloaded local files from the queue.
    def local_search
      threads = []
      # The original read the never-assigned @documents (NoMethodError on
      # nil) and guarded the join on the equally-unassigned @threads.
      while @@documents.size >= 1
        threads << Thread.new do
          doc = @@documents.pop
          detect_type(doc.split(".")[-1], doc)
        end
        threads.each { |t| t.join }
      end
    end

    # Downloads each queued [url, format] pair, extracts its text, and
    # deletes the temp file afterwards.
    def search
      threads = []
      while @@documents.size >= 1
        threads << Thread.new do
          document = @@documents.pop
          url = document[0].gsub(' ', '+')
          format = document[1]
          if data = download(url)
            name = save_to_disk(url, format, data)
            detect_type(format, name)
            remove_from_disk(name)
          end
        end
        threads.each { |t| t.join }
      end
    end

    private

    # Routes a file to the right extractor based on its format/extension.
    def detect_type(format, name)
      case format
      # XML-based formats must be tested before /doc/: ".docx" also
      # matches /doc/ and was previously sent to the .doc parser.
      when /docx|xlsx|pptx|odt|odp|ods|odb/
        xml(name)
      # Match with or without the leading dot ("pdf" comes from
      # local_search's split, ".pdf" from the crawler).
      when /pdf/
        pdf(name)
      when /doc/
        doc(name)
      when /txt|rtf|ans/
        plain(name)
      else
        D "Error: Not currently parsing #{format}"
      end
    end

    # HEADs the URL first to honor the @size limit, then GETs the body.
    # Returns the body String or nil on any failure.
    def download(doc)
      web = URI.parse(doc)
      begin
        # Include the '?' separator the original omitted when a query
        # string is present.
        path = web.query ? "#{web.path}?#{web.query}" : web.path
        http = Net::HTTP.new(web.host, 80)
        http.start do |http|
          response = http.request(Net::HTTP::Head.new(path))
          # content_length may be nil when the server omits the header.
          if response.content_length && response.content_length < @size
            D "Downloading document: #{web.to_s}\n"
            response = http.request(Net::HTTP::Get.new(path))
            case response
            when Net::HTTPSuccess, Net::HTTPRedirection
              return response.body
            else
              return response.error!
            end
          else
            D "Debug: Skipping #{web.to_s}. bigger than 10MB.\n"
          end
        end
      rescue Net::HTTPFatalError
        D "Error: HTTPFatalError - Unable to download.\n"
      rescue Net::HTTPServerException
        D "Error: Not longer there. 404 Not Found.\n"
      rescue
        D "Error: < .. SocketError .. >\n"
      end
      nil
    end

    # Writes +data+ to a uniquely-named temp file and returns its path.
    def save_to_disk(url, format, data)
      name = TEMP + "#{hash_url(url)}" + format
      File.open(name, "wb") { |file| file.write(data) }
      name
    end

    # Portable delete (the original shelled out to `rm`, which does not
    # exist on the Windows platform TEMP supports).
    def remove_from_disk(name)
      File.delete(name) if File.exist?(name)
    end

    def hash_url(url)
      Digest::SHA2.hexdigest("#{Time.now.to_f}--#{url}")
    end

    # Extracts text from a PDF via PDF::Reader and harvests emails.
    def pdf(name)
      begin
        receiver = PageTextReceiver.new
        pdf = PDF::Reader.file(name, receiver)
        search_emails(receiver.content.inspect)
      rescue PDF::Reader::UnsupportedFeatureError
        D "Error: Encrypted PDF - Unable to parse.\n"
      rescue PDF::Reader::MalformedPDFError
        D "Error: Malformed PDF - Unable to parse.\n"
      rescue
        D "Error: Unknown - Unable to parse.\n"
      end
    end

    # Extracts text from a .doc, preferring MS Word automation on Windows
    # and antiword elsewhere.
    def doc(name)
      if RUBY_PLATFORM =~ /mingw|mswin/
        begin
          word(name)
        rescue
          antiword(name)
        end
      elsif RUBY_PLATFORM =~ /linux|darwin/
        begin
          antiword(name)
        rescue
          D "Error: Unable to parse .doc"
        end
      else
        D "Error: Platform not supported."
      end
    end

    # Drives MS Word over OLE to read the document text (Windows only).
    def word(name)
      word = WIN32OLE.new('word.application')
      word.documents.open(name)
      word.selection.wholestory
      search_emails(word.selection.text.chomp)
      word.activedocument.close( false )
      word.quit
    end

    # Extracts .doc text with the antiword binary when available.
    def antiword(name)
      case RUBY_PLATFORM
      when /mingw|mswin/
        if File.exists?("C:\\antiword\\antiword.exe")
          search_emails(`C:\\antiword\\antiword.exe "#{name}" -f -s`)
        end
      when /linux|darwin/
        if File.exists?("/usr/bin/antiword") or
           File.exists?("/usr/local/bin/antiword") or
           File.exists?("/opt/local/bin/antiword")
          search_emails(`antiword "#{name}" -f -s`)
        end
      else
        # This is Ghetto but, for now it works on emails
        # that do not contain Capital letters :)
        D "Debug: Using the Ghetto way."
        search_emails(File.open(name).readlines[0..19].to_s)
      end
    end

    def plain(name)
      search_emails(File.open(name).readlines.to_s)
    end

    # Office Open XML / OpenDocument files are zip archives; read every
    # inner .xml entry and scan the concatenation.
    def xml(name)
      begin
        Zip::ZipFile.open(name) do |zip|
          # The original referenced an undefined `z` and discarded the
          # text it read; collect it properly here.
          text = zip.entries.map { |e| zip.file.read(e.name) if e.name =~ /\.xml$/ }.compact.join
          search_emails(text)
        end
      rescue
        D "Error: Unable to parse .#{name.scan(/\..[a-z]*$/)}\n"
      end
    end

    # Scans +text+ for emails in plain and obfuscated forms (" at ",
    # "_at_", " dot ", " @ ") and records them under @lock.
    def search_emails(text)
      list = text.scan(/[a-z0-9!#$&'*+=?^_`{|}~-]+(?:\.[a-z0-9!#$&'*+=?^_`{|}~-]+)*_at_\
(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z](?:[a-z-]*[a-z])?|\
[a-z0-9!#$&'*+=?^_`{|}~-]+(?:\.[a-z0-9!#$&'*+=?^_`{|}~-]+)*\sat\s(?:[a-z0-9](?:[a-z0-9-]\
*[a-z0-9])?\.)+[a-z](?:[a-z-]*[a-z])?|[a-z0-9!#$&'*+=?^_`{|}~-]+\
(?:\.[a-z0-9!#$&'*+=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z](?:[a-z-]*[a-z])?|\
[a-z0-9!#$&'*+=?^_`{|}~-]+(?:\.[a-z0-9!#$&'*+=?^_`{|}~-]+)*\s@\s(?:[a-z0-9](?:[a-z0-9-]*\
[a-z0-9])?\.)+[a-z](?:[a-z-]*[a-z])?|[a-z0-9!#$&'*+=?^_`{|}~-]+(?:\sdot\s[a-z0-9!#$&'*+=?^_`\
{|}~-]+)*\sat\s(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\sdot\s)+[a-z](?:[a-z-]*[a-z])??/i)
      @lock.synchronize do
        c_list = fix(list)
        @emails.concat(c_list).uniq!
        # (each, not the original's odd Array#zip-with-block, which only
        # worked because zip wraps each element in a one-item array)
        c_list.each do |e|
          @results << [e, "E", self.class.to_s.upcase,
            e.match(/#{CGI.unescape(ESearchy::Search.query).gsub("@","").split('.')[0]}/) ? "T" : "F"]
        end
      end
    end

    # Normalizes obfuscated addresses in place and strips phone-number
    # runs; returns the (mutated) list.
    def fix(list)
      list.each do |e|
        e.gsub!(" at ","@")
        e.gsub!("_at_","@")
        e.gsub!(" dot ",".")
        e.gsub!(/[+0-9]{0,3}[0-9()]{3,5}[-]{0,1}[0-9]{3,4}[-]{0,1}[0-9]{3,5}/,"")
      end
    end
  end
end