esearchy 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +112 -0
- data/bin/esearchy +334 -0
- data/lib/esearchy/LocalEngines/directory.rb +16 -0
- data/lib/esearchy/OtherEngines/googlegroups.rb +27 -0
- data/lib/esearchy/OtherEngines/ldap.rb +44 -0
- data/lib/esearchy/OtherEngines/pgp.rb +22 -0
- data/lib/esearchy/OtherEngines/spider.rb +43 -0
- data/lib/esearchy/OtherEngines/usenet.rb +22 -0
- data/lib/esearchy/SearchEngines/altavista.rb +25 -0
- data/lib/esearchy/SearchEngines/bing.rb +32 -0
- data/lib/esearchy/SearchEngines/google.rb +30 -0
- data/lib/esearchy/SearchEngines/yahoo.rb +32 -0
- data/lib/esearchy/SocialEngines/classmates.rb +33 -0
- data/lib/esearchy/SocialEngines/googleprofiles.rb +36 -0
- data/lib/esearchy/SocialEngines/linkedin.rb +35 -0
- data/lib/esearchy/SocialEngines/linkedinfull.rb +100 -0
- data/lib/esearchy/SocialEngines/naymz.rb +36 -0
- data/lib/esearchy/bugmenot.rb +26 -0
- data/lib/esearchy/docs.rb +267 -0
- data/lib/esearchy/esearchy.rb +195 -0
- data/lib/esearchy/genericengine.rb +153 -0
- data/lib/esearchy/localengines.rb +1 -0
- data/lib/esearchy/otherengines.rb +5 -0
- data/lib/esearchy/searchengines.rb +4 -0
- data/lib/esearchy/socialengines.rb +4 -0
- data/lib/esearchy/useragent.rb +188 -0
- data/lib/esearchy.rb +24 -0
- metadata +129 -0
@@ -0,0 +1,30 @@
|
|
1
|
+
module ESearchy
  module SearchEngines
    # Scrapes Google web-search result pages (via the /cse endpoint) and
    # feeds hit counts and result links to GenericEngine.
    class Google < ESearchy::GenericEngine
      ENGINE = "www.google.com"
      PORT = 80
      NUM = 100  # results requested per page
      TYPE = 1

      # Builds the query path for @query and delegates paging/fetching to
      # GenericEngine#search.
      def search
        @querypath = "/cse?&safe=off&num=100&site=&q=" + @query + "&btnG=Search&start="
        super
      end

      # Extracts the total-hit count and the result links from a raw Google
      # HTML page, then hands the link matches to GenericEngine#parse.
      def parse( html )
        hits = html.scan(/<\/b> of [\w\s]*<b>(.*)<\/b> for /)
        # String#scan never returns nil, so only the empty case needs
        # handling (the original also tested `hits == nil`, unreachable).
        if hits.empty?
          @totalhits = 0
        else
          @totalhits = totalhits(hits[0][0].gsub(",","").to_i)
        end
        super html.scan(/<div class=g><span class="b w xsm">\[([A-Z]+)\]<\/span> \
<h2 class=r><a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)"|<h2 class=r><a href="\
([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)"/)
      end
    end
  end
end
|
28
|
+
|
29
|
+
|
30
|
+
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module ESearchy
  module SearchEngines
    # Queries the Yahoo BOSS JSON web-search API.
    class Yahoo < ESearchy::GenericEngine
      ENGINE = "boss.yahooapis.com"
      PORT = 80
      NUM = 50  # results requested per page
      TYPE = 1

      # Builds the BOSS query path and delegates paging to GenericEngine.
      # The original appended `or raise` to the string assignment, which can
      # never fire (a String is always truthy) — and a nil @appid would have
      # raised TypeError on the `+` first. Check the app id up front instead.
      def search
        raise ESearchyMissingAppID, "Missing AppID <Class.appid=>" unless @appid
        @querypath = "/ysearch/web/v1/" + @query +
          "?appid=" + @appid + "&format=json&count=50"
        super
      end

      # Yahoo BOSS requires a registered developer application id.
      def appid=(value)
        @appid = value
      end

      # Parses the BOSS JSON payload, records the hit total and hands the
      # web result set to GenericEngine#parse.
      def parse(json)
        doc = JSON.parse(json)
        hits = doc["ysearchresponse"]["totalhits"].to_i
        # to_i always yields an Integer, so only the zero case matters
        # (the original also tested `hits == nil`, unreachable).
        if hits == 0
          @totalhits = 0
        else
          @totalhits = totalhits(hits)
        end
        super doc["ysearchresponse"]["resultset_web"]
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module ESearchy
  module SocialEngines
    # Finds people via Google results restricted to classmates.com profiles
    # that mention working at the target company.
    class Classmates < ESearchy::GenericEngine
      ENGINE = "www.google.com"
      PORT = 80
      NUM = 100  # results requested per page
      TYPE = 2

      # Builds the Google CSE query path and pages via GenericEngine#search.
      # The original tacked `or raise` onto the string assignment, which can
      # never fire; @company must be checked before CGI.escape dereferences it.
      def search
        raise ESearchyMissingCompany, "Missing website url Object.company=(value)" unless @company
        @querypath = "/cse?q=site%3Awww.classmates.com+%22work+at+" + CGI.escape(@company) +
          "%22&hl=en&cof=&num=100&filter=0&safe=off&start="
        super
      end

      # Records the total-hit count from the Google results banner.
      def parse( html )
        hits = html.scan(/<\/b> of[ about | ]<b>(.*)<\/b> from/)
        # scan never returns nil; only the empty case needs handling
        # (the original also tested `hits == nil`, unreachable).
        if hits.empty?
          @totalhits = 0
        else
          @totalhits = totalhits(hits[0][0].gsub(",","").to_i)
        end
      end

      # Pulls "First Last" out of each result-link title (" |" suffix on
      # classmates.com titles) and records the person.
      def crawl_people(html)
        html.scan(/<a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)" class=l[\sonmousedown="return clk(this.href,'','','res','\d','')"]*>([\w\s]*) \|/).each do |profile|
          name,last = profile[1].split(" ")
          @people << [name,last]
          @results << [[name,last], "P", self.class.to_s.upcase, "N"]
        end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module ESearchy
  module SocialEngines
    # Finds people via Google results restricted to Google Profiles pages
    # that list the target company under "Companies I've worked for".
    class GoogleProfiles < ESearchy::GenericEngine
      ENGINE = "www.google.com"
      PORT = 80
      NUM = 100  # results requested per page
      TYPE = 2

      # Builds the Google CSE query path and pages via GenericEngine#search.
      # The original tacked `or raise` onto the string assignment, which can
      # never fire; @company must be checked before CGI.escape dereferences it.
      def search
        raise ESearchyMissingCompany, "Missing website url Object.company=(value)" unless @company
        @querypath = "/cse?q=site:www.google.com+intitle:%22Google+" +
          "Profile%22+%22Companies+I%27ve+worked+for%22+%22at+" +
          CGI.escape(@company) + "%22&hl=en&cof=&num=100&filter=0&safe=off&start="
        super
      end

      # Records the total-hit count from the results banner, e.g.
      # "Results <b>1</b> - <b>8</b> of <b>8</b> from <b>www.google.com</b>".
      def parse( html )
        hits = html.scan(/<\/b> of <b>(.*)<\/b> from /)
        # scan never returns nil; only the empty case needs handling
        # (the original also tested `hits == nil`, unreachable).
        if hits.empty?
          @totalhits = 0
        else
          @totalhits = totalhits(hits[0][0].gsub(",","").to_i) unless @was_here
        end
      end

      # Pulls "First Last" out of each result-link title (" -" suffix) and
      # records the person.
      def crawl_people(text)
        text.scan(/<a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)" class=l[\sonmousedown="return clk(this.href,'','','res','\d','')"]*>([\w\s]*) -/).each do |profile|
          name,last = profile[1].split(" ")
          @people << [name,last]
          @results << [[name,last], "P", self.class.to_s.upcase, "N"]
        end
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module ESearchy
  module SocialEngines
    # Finds people via Google results restricted to linkedin.com/in profile
    # pages that mention the target company.
    class LinkedIn < ESearchy::GenericEngine
      ENGINE = "www.google.com"
      PORT = 80
      NUM = 100  # results requested per page
      TYPE = 2

      # Builds the Google CSE query path and pages via GenericEngine#search.
      # The original tacked `or raise` onto the string assignment, which can
      # never fire; @company must be checked before CGI.escape dereferences it.
      def search
        raise ESearchyMissingCompany, "Missing website url Object.company=(value)" unless @company
        @querypath = "/cse?q=site%3Awww.linkedin.com/in+%22at+" +
          CGI.escape(@company) + "%22&hl=en&cof=&num=100&filter=0&safe=off&start="
        super
      end

      # Records the total-hit count from the results banner, e.g.
      # "Results <b>1</b> - <b>8</b> of <b>8</b> from <b>www.google.com</b>".
      def parse( html )
        hits = html.scan(/<\/b> of [\w\s]*<b>(.*)<\/b> from /)
        # scan never returns nil; only the empty case needs handling
        # (the original also tested `hits == nil`, unreachable).
        if hits.empty?
          @totalhits = 0
        else
          @totalhits = totalhits(hits[0][0].gsub(",","").to_i)
        end
      end

      # Pulls "First Last" out of each result-link title (" -" suffix) and
      # records the person.
      def crawl_people(text)
        text.scan(/<a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)" class=l[\sonmousedown="return clk(this.href,'','','res','\d','')"]*>([\w\s]*) -/).each do |profile|
          name,last = profile[1].split(" ")
          @people << [name,last]
          @results << [[name,last], "P", self.class.to_s.upcase, "N"]
        end
      end
    end
  end
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
require 'net/https'
module ESearchy
  module SocialEngines
    # Authenticated LinkedIn engine: signs in with real credentials and
    # scrapes the logged-in people-search pages on www.linkedin.com.
    # NOTE(review): this class has the same fully-qualified name as the
    # Google-based engine in linkedin.rb; loading both re-opens one class.
    class LinkedIn < ESearchy::GenericEngine
      ENGINE = "www.linkedin.com"
      PORT = 80
      NUM = 1
      TYPE = 2

      # Builds the authenticated people-search path for @company and pages
      # via GenericEngine#search.
      # The query literal previously read "search=¤tCompany" — mojibake
      # from the HTML entity &curren swallowing "&curr" — restored to
      # "search=&currentCompany". The old `or raise` on the assignment could
      # never fire, so @company is now checked up front.
      def search
        raise ESearchyMissingCompany, "Missing website url Object.company=(value)" unless @company
        @querypath = "/search?search=&currentCompany=co&company=" + CGI.escape(@company) +
          "&proposalType=Y&newnessType=Y&pplSearchOrigin=MDYS&searchLocationType=Y&page_num="
        super
      end

      # Currently debug-only: dumps the summary banner. The total-hit
      # bookkeeping is still commented out, as in the original.
      def parse( html )
        p html
        p html.scan(/<p class="summary">[\n\s]+<strong>(.*)<\/strong> results/)#.gsub(/,|./,"")
        #unless @was_here
        #  @totalhits= totalhits html.scan(/<p class="summary">[\n\s]+<strong>(.*)<\/strong> results/)[0][0].to_i
        #end
      end

      # c = [user, password]. Logs in and installs the resulting session
      # header as the engine's request HEADER constant.
      def credentials=(c)
        @user = c[0].to_s
        @pwd = c[1].to_s
        LinkedIn.const_set :HEADER, login
        self.start=(1)
      end

      # LinkedIn shows 10 profiles per page, so scale the page budget.
      def maxhits=(v)
        super v/10
      end

      private
      # Extracts given/family name pairs from the logged-in search results.
      def crawl_people(html)
        list = html.scan(/title="View profile">[\n\s]+<span class="given-name">(.*)<\/span>\
[\n\s]+<span class="family-name">(.*)<\/span>/)
        @people.concat(list).uniq!
        list.each { |p| @results << [p, "P", self.class.to_s.upcase, "N"] }
      end

      # Fetches the login page for the csrf token, then POSTs the
      # credentials over TLS. Returns a header hash carrying the session
      # cookie on success, or raises via response.error! otherwise.
      def login
        begin
          get ENGINE, PORT, "/secure/login?trk=hb_signin", {'User-Agent' => "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.1.5) Gecko/20091102"} do |r|
            @l_headers = r.to_hash
            @l_headers.each {|k,v| @l_headers[k] = v.to_s}
            @csrfToken = r.body.scan(/<input type="hidden" name="csrfToken" value="ajax:(.*)">/)[0][0]
          end
          http = Net::HTTP.new(ENGINE,443)
          http.use_ssl = true
          # NOTE(review): certificate verification is disabled.
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
          http.start do |http|
            # Escape only the credential values. The original CGI.escaped the
            # entire body (destroying the '&'/'=' separators) and was missing
            # the '&' before session_key.
            body = "csrfToken=ajax:#{CGI.escape(@csrfToken)}" +
                   "&session_key=#{CGI.escape(@user)}" +
                   "&session_password=#{CGI.escape(@pwd)}" +
                   "&session_login=Sign+In&session_login=&session_rikey="

            @l_headers['Host'] = "www.linkedin.com"
            @l_headers['User-Agent'] = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5"
            @l_headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
            @l_headers['Accept-Language'] = "en-us,en;q=0.5"
            @l_headers['Accept-Charset'] = "ISO-8859-1,utf-8;q=0.7,*;q=0.7"
            @l_headers['Keep-Alive'] = "300"
            @l_headers['Connection'] = "keep-alive"
            @l_headers['Referer'] = "https://www.linkedin.com/secure/login?trk=hb_signin"
            # NOTE(review): hardcoded stale session cookie carried over from
            # development — likely dead weight; confirm whether LinkedIn
            # requires any of these values at login time.
            @l_headers['Cookie'] = "JSESSIONID=\"ajax:5367441617418183976\"; visit=G; bcookie=\"v=1&8231965c-b4b7-48f2-8349-76514ba89b69\"; lang=\"v=2&lang=en&c=\"; NSC_MC_QH_MFP=e242089229a3; __utma=226841088.2037160969.1259078198.1259078198.1259078198.1; __utmb=226841088.2.10.1259078198; __utmc=226841088; __utmz=226841088.1259078198.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=226841088.user; leo_auth_token=\"GST:9_t6crYtB4AWStfoqhWQ6LYPKakWfHk_dotQyAHagiRX1HlEvqVt5-:1259081816:56d4aecb2e985d7f8a30d74e758f261ea8b92065\"; NSC_MC_WT_YUSL_IUUQ=e2420f8429a0"
            @l_headers['Content-Type'] = "application/x-www-form-urlencoded"
            @l_headers['Content-Length'] = body.size.to_s

            request = Net::HTTP::Post.new("/secure/login", @l_headers)
            # was `CGI.escape(body)`, which mangled the form encoding
            request.body = body
            response = http.request(request)
            case response
            when Net::HTTPSuccess, Net::HTTPRedirection
              return {'Cookie' => response['Set-Cookie'], 'User-Agent' => UserAgent::fetch}
            else
              return response.error!
            end
          end
        rescue Net::HTTPFatalError
          # was "${$@}", which is not interpolation and printed literally
          D "Error: Something went wrong while login to LinkedIn.\n\t#{$@}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module ESearchy
  module SocialEngines
    # Finds people via Google results restricted to naymz.com profiles that
    # mention "@ <company>".
    class Naymz < ESearchy::GenericEngine
      ENGINE = "www.google.com"
      PORT = 80
      NUM = 100  # results requested per page
      TYPE = 2

      # Builds the Google CSE query path and pages via GenericEngine#search.
      def search
        @querypath = "/cse?q=site:naymz.com%20%2B%20%22@%20" + CGI.escape(@company) +
          "%22&hl=en&cof=&num=100&filter=0&safe=off&start="
        super
      end

      # Records the total-hit count from the results banner, e.g.
      # "</b> of about <b>760</b> from <b>".
      def parse( html )
        hits = html.scan(/<\/b> of about <b>(.*)<\/b> from/)
        # scan never returns nil; only the empty case needs handling
        # (the original also tested `hits == nil`, unreachable).
        if hits.empty?
          @totalhits = 0
        else
          @totalhits = totalhits hits[0][0].gsub(",","").to_i unless @was_here
        end
      end

      # Extracts names from result links, dropping honorific tokens.
      # NOTE(review): the dots in the honorific regex are unescaped, so the
      # filter matches slightly more than the literal titles — kept as-is.
      def crawl_people(html)
        html.scan(/<a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)" class=l[\sonmousedown="return clk(this.href,'','','res','\d','')"]*>([\w\s]*) -/).each do |profile|
          person = profile[1].split(" ").delete_if do
            |x| x =~ /mr.|mr|ms.|ms|phd.|dr.|dr|phd|phd./i
          end
          @people << person
          @results << [person, "P", self.class.to_s.upcase, "N"]
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'base64'

module ESearchy
  # Fetches throwaway shared credentials for a domain from bugmenot.com.
  class Bugmenot
    # Returns [user, pass] for `domain`, or [nil, nil] on any failure
    # (network error, page layout change, missing entry).
    def self.fetch(domain = "www.linkedin.com")
      begin
        url = Net::HTTP.get URI.parse("http://www.bugmenot.com/view/#{domain}")
        # bugmenot obfuscates credentials client-side with a page key
        key = ( url.scan(/var key =(.*);/)[0][0].to_i + 112 ) / 12
        user, pass = url.scan(/tr><th>Username <\/th><td><script>d\('(.*)'\);<\/script><\/td><\/tr>
[\n\s]+<tr><th>Password <\/th><td><script>d\('(.*)'\);<\/script><\/td><\/tr>/)[0]
        user = decode(user,key)
        pass = decode(pass,key)
        return [user, pass]
      rescue
        return [nil,nil]
      end
    end

    # Reverses bugmenot's obfuscation: base64-decode, drop the 4-byte
    # prefix, then shift every byte down by `offset`.
    # (Was an instance method under `private`, which made it unreachable
    # from the class method `fetch` — now a private class method.)
    def self.decode(input, offset)
      # thanks tlrobinson @ github
      input.unpack("m*")[0][4..-1].unpack("C*").map{|c| c - offset }.pack("C*")
    end
    private_class_method :decode
  end
end
|
@@ -0,0 +1,267 @@
|
|
1
|
+
module ESearchy
|
2
|
+
# Callback receiver for PDF::Reader: accumulates the text drawn on each
# page into an array of per-page strings.
class PageTextReceiver
  attr_accessor :content

  def initialize
    @content = []
  end

  # Invoked when a new page starts; opens a fresh accumulator string.
  def begin_page(arg = nil)
    @content.push("")
  end

  # Records a run of text drawn on the current page (whitespace-trimmed).
  def show_text(string, *params)
    @content.last << string.strip
  end

  # PDF::Reader fires several distinct text callbacks; funnel them all
  # through show_text.
  alias_method :super_show_text, :show_text
  alias_method :move_to_next_line_and_show_text, :show_text
  alias_method :set_spacing_next_line_show_text, :show_text

  # This variant receives one array mixing strings and positioning numbers;
  # only the strings carry text.
  def show_text_with_positioning(*params)
    runs = params.first
    runs.each { |item| show_text(item) if item.is_a?(String) }
  end
end
|
30
|
+
|
31
|
+
# Downloads queued documents, extracts text per format, and harvests
# e-mail addresses from the text.
class Docs
  # Scratch directory for downloaded documents.
  case RUBY_PLATFORM
  when /mingw|mswin/
    TEMP = "C:\\WINDOWS\\Temp\\"
  else
    TEMP = "/tmp/"
  end
  attr_reader :documents, :emails, :results

  # doc  - a Queue, an Array of documents, or nil (fresh queue).
  # size - maximum download size in bytes (default 10 MB).
  # NOTE(review): the pending-document queue is a class variable
  # (@@documents), so it is shared by every Docs instance.
  def initialize(doc=nil, size = 10485760)
    case doc
    when Array
      @@documents = Queue.new
      self.merge doc
    else
      @@documents = doc || Queue.new
    end
    @size = size
    @emails = []
    @results = []
    @lock = Mutex.new
  end

  ## Class methods
  # Convenience: build an instance around `doc` and run the search.
  # (The original called the class method recursively — infinite loop.)
  def self.search(doc)
    new(doc).search
  end

  # Enqueue every entry of `array`.
  def merge(array)
    array.each {|a| push(a) }
  end

  # (The original called itself recursively — infinite loop; push straight
  # onto the shared queue instead.)
  def self.push(doc)
    @@documents.push(doc)
  end

  def push(doc)
    @@documents.push(doc)
  end

  # Parse already-downloaded local files straight off the shared queue.
  def local_search
    threads = []
    # was `@documents.size` — @documents is never assigned (nil); the
    # queue lives in @@documents
    while @@documents.size >= 1
      threads << Thread.new do
        doc = @@documents.pop
        detect_type(doc.split(".")[-1], doc)
      end
      # was `if @threads != nil` (always nil), so workers were never joined
      threads.each {|t| t.join } if threads != nil
    end
  end

  # Download each queued (url, format) pair, parse it for e-mails, then
  # delete the temp file.
  def search
    threads = []
    while @@documents.size >= 1
      threads << Thread.new do
        document = @@documents.pop
        url = document[0].gsub(' ','+')
        format = document[1]
        if data = download(url)
          name = save_to_disk(url, format, data)
          detect_type(format,name)
          remove_from_disk(name)
        end
      end
      threads.each {|t| t.join } if threads != nil
    end
  end

  private

  # Route a file to the right parser based on its extension.
  def detect_type(format,name)
    case format
    when /.pdf/
      pdf(name)
    when /.doc/
      doc(name)
    when /txt|rtf|ans/
      plain(name)
    when /.docx|.xlsx|.pptx|.odt|.odp|.ods|.odb/
      xml(name)
    else
      D "Error: Not currently parsing #{format}"
    end
  end

  # HEAD first to check the size, then GET when under @size bytes.
  # Returns the response body, or nil on error/oversize.
  def download(doc)
    web = URI.parse(doc)
    begin
      http = Net::HTTP.new(web.host,80)
      http.start do |http|
        request = Net::HTTP::Head.new("#{web.path}#{web.query}")
        response = http.request(request)
        # content_length may be nil when the server omits the header;
        # treat that like an oversize file and skip (previously raised)
        if response.content_length && response.content_length < @size
          D "Downloading document: #{web.to_s}\n"
          request = Net::HTTP::Get.new("#{web.path}#{web.query}")
          response = http.request(request)
          case response
          when Net::HTTPSuccess, Net::HTTPRedirection
            return response.body
          else
            return response.error!
          end
        else
          D "Debug: Skipping #{web.to_s}. bigger than 10MB.\n"
        end
      end
    rescue Net::HTTPFatalError
      D "Error: HTTPFatalError - Unable to download.\n"
    rescue Net::HTTPServerException
      D "Error: Not longer there. 404 Not Found.\n"
    rescue
      D "Error: < .. SocketError .. >\n"
    end
    nil
  end

  # Writes `data` under a unique temp name; returns the path.
  def save_to_disk(url, format, data)
    name = TEMP + "#{hash_url(url)}" + format
    open(name, "wb") { |file| file.write(data) }
    name
  end

  # (Was `rm` via the shell — broken on the Windows TEMP branch above.)
  def remove_from_disk(name)
    File.delete(name) if File.exist?(name)
  end

  # Unique, collision-free temp-file key for a download.
  def hash_url(url)
    Digest::SHA2.hexdigest("#{Time.now.to_f}--#{url}")
  end

  def pdf(name)
    begin
      receiver = PageTextReceiver.new
      pdf = PDF::Reader.file(name, receiver)
      search_emails(receiver.content.inspect)
    rescue PDF::Reader::UnsupportedFeatureError
      D "Error: Encrypted PDF - Unable to parse.\n"
    rescue PDF::Reader::MalformedPDFError
      D "Error: Malformed PDF - Unable to parse.\n"
    rescue
      D "Error: Unknown - Unable to parse.\n"
    end
  end

  # Word documents: MS Word COM on Windows (antiword fallback), antiword
  # elsewhere.
  def doc(name)
    if RUBY_PLATFORM =~ /mingw|mswin/
      begin
        word(name)
      rescue
        antiword(name)
      end
    elsif RUBY_PLATFORM =~ /linux|darwin/
      begin
        antiword(name)
      rescue
        D "Error: Unable to parse .doc"
      end
    else
      D "Error: Platform not supported."
    end
  end

  # Extract text through the MS Word COM automation interface.
  def word(name)
    word = WIN32OLE.new('word.application')
    word.documents.open(name)
    word.selection.wholestory
    search_emails(word.selection.text.chomp)
    word.activedocument.close( false )
    word.quit
  end

  def antiword(name)
    case RUBY_PLATFORM
    when /mingw|mswin/
      if File.exist?("C:\\antiword\\antiword.exe")
        search_emails(`C:\\antiword\\antiword.exe "#{name}" -f -s`)
      end
    when /linux|darwin/
      if File.exist?("/usr/bin/antiword") or
         File.exist?("/usr/local/bin/antiword") or
         File.exist?("/opt/local/bin/antiword")
        search_emails(`antiword "#{name}" -f -s`)
      end
    else
      # This G h e t t o but, for now it works on emails
      # that do not contain Capital letters:)
      D "Debug: Using the Ghetto way."
      # block form closes the handle; join replaces the 1.8-only Array#to_s
      File.open(name) { |f| search_emails(f.readlines[0..19].join) }
    end
  end

  def plain(name)
    # File.read avoids the leaked handle and the Array#to_s behavior
    # difference between Ruby 1.8 and 1.9+
    search_emails(File.read(name))
  end

  # Office/OpenDocument files are zip archives; concatenate the XML parts.
  def xml(name)
    begin
      Zip::ZipFile.open(name) do |zip|
        # was `z.entries` (z undefined) and the read content was discarded;
        # build the combined XML text explicitly
        text = zip.entries.map { |e| zip.file.read(e.name) if e.name =~ /.xml$/ }.join
        search_emails(text)
      end
    rescue
      D "Error: Unable to parse .#{name.scan(/\..[a-z]*$/)}\n"
    end
  end

  # Scans `text` for addresses in plain and obfuscated forms
  # ("a at b dot com", "a_at_b", "a @ b"), normalizes them, and records
  # them under @emails/@results (thread-safe).
  def search_emails(text)
    list = text.scan(/[a-z0-9!#$&'*+=?^_`{|}~-]+(?:\.[a-z0-9!#$&'*+=?^_`{|}~-]+)*_at_\
(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z](?:[a-z-]*[a-z])?|\
[a-z0-9!#$&'*+=?^_`{|}~-]+(?:\.[a-z0-9!#$&'*+=?^_`{|}~-]+)*\sat\s(?:[a-z0-9](?:[a-z0-9-]\
*[a-z0-9])?\.)+[a-z](?:[a-z-]*[a-z])?|[a-z0-9!#$&'*+=?^_`{|}~-]+\
(?:\.[a-z0-9!#$&'*+=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z](?:[a-z-]*[a-z])?|\
[a-z0-9!#$&'*+=?^_`{|}~-]+(?:\.[a-z0-9!#$&'*+=?^_`{|}~-]+)*\s@\s(?:[a-z0-9](?:[a-z0-9-]*\
[a-z0-9])?\.)+[a-z](?:[a-z-]*[a-z])?|[a-z0-9!#$&'*+=?^_`{|}~-]+(?:\sdot\s[a-z0-9!#$&'*+=?^_`\
{|}~-]+)*\sat\s(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\sdot\s)+[a-z](?:[a-z-]*[a-z])??/i)
    @lock.synchronize do
      c_list = fix(list)
      @emails.concat(c_list).uniq!
      # Array#zip with a block yields each element wrapped in an array,
      # hence e[0] below — kept as-is to preserve behavior.
      c_list.zip do |e|
        @results << [e[0], "E", self.class.to_s.upcase,
          e[0].match(/#{CGI.unescape(ESearchy::Search.query).gsub("@","").split('.')[0]}/) ? "T" : "F"]
      end
    end
  end

  # Normalize obfuscated addresses ("a at b dot com" -> "a@b.com") and
  # strip embedded phone-number digit runs. Mutates and returns `list`.
  def fix(list)
    list.each do |e|
      e.gsub!(" at ","@")
      e.gsub!("_at_","@")
      e.gsub!(" dot ",".")
      e.gsub!(/[+0-9]{0,3}[0-9()]{3,5}[-]{0,1}[0-9]{3,4}[-]{0,1}[0-9]{3,5}/,"")
    end
  end
end
|
267
|
+
end
|