gmail-scraper 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/conv_summary.rb +50 -0
- data/lib/conversation.rb +27 -0
- data/lib/email.rb +56 -0
- data/lib/gmail.rb +122 -0
- data/spec/basic_spec.rb +58 -0
- metadata +68 -0
data/lib/conv_summary.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Raised by ConvSummary.create_from_html when the first link of a listing row
# points to a draft (its href contains "draft=") instead of a thread.
#
# Derives from StandardError rather than Exception so that a bare `rescue`
# (which only covers StandardError) can handle it, and so that handling it
# never requires rescuing signals or exit requests.
class DraftException < StandardError
end
|
3
|
+
|
4
|
+
# Summary of one conversation (thread) as parsed from a row of the
# conversation listing.
class ConvSummary

  attr_accessor :uid, :subject, :nb_emails, :updated_at, :tags, :url

  # Builds a ConvSummary from one listing row.
  #
  # html - an object responding to #content (the row text) and #search (XPath
  #        lookup returning elements that expose #attributes).
  #
  # Returns the populated ConvSummary. updated_at is not set here.
  # Raises DraftException when the row's first link contains "draft=".
  def self.create_from_html(html)
    conv = new

    # The row text is split on blank lines; fields are then read by position:
    # [1] participants and message count, [3] labels, [4] subject.
    data = html.content.split("\n\n")

    # extract link
    link = html.search(".//a").first.attributes['href'].to_s

    # raise an exception if there is only a draft
    raise DraftException.new if link[/draft=/]

    conv.url = link
    # Keep only the value of the "th" parameter; stop at the next parameter or
    # fragment so trailing "&..." is not swallowed into the uid.
    # uid is nil when the link carries no "th=" parameter.
    conv.uid = link[/th=([^&#]*)/, 1]

    # extract nb_emails: the number between parentheses, 1 when absent
    # NOTE(review): drafts are not subtracted from this count. Earlier code
    # that attempted it was overwritten before use, so the effective behavior
    # is kept; confirm whether drafts should be excluded.
    count = data[1].to_s[/\((\d+)\)/, 1]
    conv.nb_emails = count.nil? ? 1 : count.to_i

    # extract labels (empty list when the row has no label field)
    conv.tags = data[3].to_s.split(",").map { |label| label.strip.downcase }

    # extract subject (nil when the row has no subject field)
    conv.subject = data[4]

    conv
  end
end
|
data/lib/conversation.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'conv_summary.rb'
|
2
|
+
# A conversation with its emails: the fields of a ConvSummary plus the list of
# emails added through #add_email and the date range they span.
class Conversation < ConvSummary

  attr_accessor :created_at, :emails

  # Copies subject, nb_emails, tags, uid and url from the given summary and
  # starts with an empty list of emails.
  #
  # conv_summary - object exposing subject, nb_emails, tags, uid and url.
  def initialize(conv_summary)
    @subject = conv_summary.subject
    @nb_emails = conv_summary.nb_emails
    @tags = conv_summary.tags
    @emails = []
    @uid = conv_summary.uid
    @url = conv_summary.url
  end

  # Appends an email and widens the conversation's date range:
  # created_at becomes the earliest email date, updated_at the latest.
  # The email also receives the conversation's subject when one is set.
  #
  # email - object exposing created_at and a subject= writer.
  def add_email(email)
    @emails << email
    @created_at = email.created_at if @created_at.nil? || @created_at > email.created_at
    @updated_at = email.created_at if @updated_at.nil? || @updated_at < email.created_at
    email.subject = @subject unless @subject.nil?
  end

  # Orders the stored emails chronologically (by created_at), keeps that order
  # in #emails and returns the sorted list.
  #
  # Sorting goes through created_at because the email objects themselves are
  # not assumed to be comparable with each other.
  def sort_emails
    @emails = @emails.sort_by { |email| email.created_at }
  end
end
|
data/lib/email.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'date' # DateTime.strptime is used below

# One email of a conversation, parsed from its HTML table.
class Email
  attr_accessor :uid, :sender, :receivers, :created_at, :text, :subject

  def initialize
    @receivers = []
  end

  # Builds an Email from the HTML table of one message.
  #
  # html - an object whose #search(".//tr/td") returns the table cells; each
  #        cell responds to #content and #search. Cells are read by position:
  #        [0] sender (and links carrying the uid), [1] date, [2] TO, [3] CC,
  #        [4] body, or [5] when cell [4] starts with "Reply |".
  #
  # Returns the Email. receivers holds two lists, in this order: the TO
  # addresses and the CC addresses.
  # Raises ArgumentError when the date cell does not match the expected
  # format ("Mon, Jan 4, 2010 at 9:05 AM").
  def self.create_from_html(html)
    email = Email.new
    cells = html.search(".//tr/td")

    # sender: first address found in the first cell, lower-cased
    first_sender = emails_to_array(cell_text(cells[0])).first
    email.sender = first_sender && first_sender.downcase

    # uid: the fragment ("#...") of the links in the sender cell; links without
    # a fragment are ignored, the last link with one wins
    unless cells[0].nil?
      cells[0].search(".//a").each do |link|
        anchor = link.attributes['href'].to_s[/#.*/]
        email.uid = anchor.delete("#") unless anchor.nil?
      end
    end

    # date
    email.created_at = DateTime.strptime(cell_text(cells[1]), '%a, %b %d, %Y at %I:%M %p')

    # extract TO, then CC (kept as two separate lists)
    email.receivers << emails_to_array(cell_text(cells[2]))
    email.receivers << emails_to_array(cell_text(cells[3]))

    # extract Body: when the fifth cell is the "Reply | ..." action bar the
    # body is in the next cell
    body = cell_text(cells[4])
    body = cell_text(cells[5]) if body.index("Reply |") == 0
    email.text = body

    email
  end

  # Returns the distinct email addresses found in txt, in order of appearance.
  # The top-level domain may be any length of two letters or more.
  def self.emails_to_array(txt)
    txt.scan(/[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}/i).uniq
  end

  # Text of a table cell with surrounding whitespace removed; "" when the cell
  # is missing. The text itself is left untouched (no characters are removed
  # from inside it).
  def self.cell_text(cell)
    cell.nil? ? "" : cell.content.strip
  end
  private_class_method :cell_text
end
|
56
|
+
|
data/lib/gmail.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'conversation.rb'
|
4
|
+
require 'email.rb'
|
5
|
+
|
6
|
+
|
7
|
+
# Raised when the page fetched for a conversation contains no email table.
#
# Derives from StandardError rather than Exception so that a bare `rescue`
# (which only covers StandardError) can handle it.
class ThreadNotFoundException < StandardError
end
|
9
|
+
|
10
|
+
# Scrapes conversations and emails from the HTML version of Gmail through a
# Mechanize agent.
class Gmail

  # Number of conversations used as the paging step of the listing.
  CONV_PER_PAGE = 50
  # Rows of the listing page that are treated as conversations.
  XPATH_CONV_IN_LIST = "//tr[@bgcolor='#E8EEF7'] | //tr[@bgcolor='#ffffff']"
  # Tables of a conversation page that are treated as emails.
  XPATH_EMAIL_IN_CONV = ".//table[@bgcolor='#efefef' and @border='0']"

  # email - login written into the "Email" field of the login form.
  # pwd   - password written into the "Passwd" field of the login form.
  def initialize(email, pwd)
    @email = email
    @pwd = pwd
  end

  # Submits the login form and opens the HTML mailbox.
  #
  # Returns true when the page obtained after login is not titled
  # "Gmail: Email from Google", false otherwise.
  def connect
    @agent = mechanize_class.new
    login_form = @agent.get("http://www.gmail.com").forms.first
    login_form.Email = @email
    login_form.Passwd = @pwd
    @agent.submit(login_form)
    page = @agent.get "http://mail.google.com/mail/?ui=html&zy=a&s=a"

    # bad (but working) method to detect the connection
    page.title.gsub("\n", "").strip != "Gmail: Email from Google"
  end

  # Scrapes the summaries of the conversations whose index lies between
  # conv_start and conv_end (both included; no upper bound when conv_end is
  # nil) and yields each one as a ConvSummary.
  #
  # Rows that raise DraftException are reported on stdout and skipped. The
  # rescue also covers the caller's block.
  def list(conv_start = 0, conv_end = nil)
    summary_as_html(conv_start, conv_end) do |html_conv|
      begin
        yield(ConvSummary.create_from_html(html_conv))
      rescue DraftException
        puts "Skiping a draft"
      end
    end
  end

  # Scrapes the emails from the url of a given conversation.
  #
  # conv_summary - the ConvSummary to expand.
  #
  # Returns a Conversation holding one Email per email table found.
  # Raises ThreadNotFoundException when the page contains no email.
  def fetch_conversation(conv_summary)
    conv = Conversation.new(conv_summary)
    conversation_as_html(conv.url) do |html|
      conv.add_email(Email.create_from_html(html))
    end
    conv
  end

  private

  # Mechanize moved from WWW::Mechanize to a top-level Mechanize class; use
  # whichever the installed version defines.
  def mechanize_class
    defined?(::Mechanize) ? ::Mechanize : ::WWW::Mechanize
  end

  # Walks the listing pages starting at the page containing conv_start and
  # yields the HTML row of every conversation within [conv_start, conv_end].
  # Stops at the first row past conv_end, or after the page that has no link
  # to a following page.
  #
  # The end-of-listing condition is local to each call, so the listing can be
  # walked any number of times on the same Gmail instance.
  def summary_as_html(conv_start, conv_end)
    page_index = conv_start / CONV_PER_PAGE

    loop do
      offset = page_index * CONV_PER_PAGE

      # get the page of threads
      url = "?s=a&st=#{offset}"
      puts "fetching page: #{url}"
      page_list = @agent.get url

      # delegate each thread (html format)
      page_list.search(XPATH_CONV_IN_LIST).each_with_index do |conv, i|
        conv_index = offset + i
        puts "conversation_index: #{conv_index}"

        return if !conv_end.nil? && conv_index > conv_end

        yield(conv) if conv_index >= conv_start
      end

      # is there a next page?
      next_href = "?s=a&st=#{(page_index + 1) * CONV_PER_PAGE}"
      break if page_list.search("//a[@href='#{next_href}']").size == 0
      page_index += 1
    end
  end

  # Scrapes the emails from the HTML UI of a thread/conversation.
  #
  # url - url of the conversation (required; "&d=e" is appended to it).
  #
  # Yields each email table found in the page.
  # Raises ArgumentError when url is nil.
  # Raises ThreadNotFoundException when the page contains no email table.
  def conversation_as_html(url = nil)
    raise ArgumentError, "the url of the conversation is required" if url.nil?

    # request the url of the thread
    list_emails = @agent.get("#{url}&d=e").search(XPATH_EMAIL_IN_CONV)
    raise ThreadNotFoundException.new if list_emails.empty?

    list_emails.each { |email_html| yield(email_html) }
  end

end
|
122
|
+
|
data/spec/basic_spec.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require "spec"
|
3
|
+
require '../lib/gmail.rb'
|
4
|
+
|
5
|
+
# Integration examples: they talk to the real Gmail HTML interface, so they
# need network access and a working account.
describe Gmail do

  before(:all) do
    # Credentials can be supplied through the environment (GMAIL_LOGIN /
    # GMAIL_PASSWORD) so the file does not have to be edited; the placeholders
    # are used otherwise.
    @login = ENV['GMAIL_LOGIN'] || "loging"
    @pass = ENV['GMAIL_PASSWORD'] || "password"
    @gmail = Gmail.new(@login, @pass)
    @gmail.connect
  end

  it "should not be connected with wrong login/password" do
    gmail = Gmail.new("bar", "foo")
    gmail.connect.should == false
  end

  it "should be connected with right login/password" do
    @gmail.connect.should == true
  end

  it "should be able to extract a summary of the first conversations" do
    email_start = 0
    email_end = 1
    @gmail.connect
    i = 0
    @gmail.list(email_start, email_end) { |thread_summary|
      thread_summary.subject.should_not == nil
      thread_summary.nb_emails.should >= 1
      thread_summary.uid.should_not == nil
      thread_summary.tags.size.should >= 0

      i += 1
    }
    # both bounds are included
    i.should == (email_end - email_start + 1)
  end

  it "should be able only to extract the correct interval of conversations" do
    i = 0
    @gmail.list(1, 2) { |conv_summary|
      @gmail.fetch_conversation(conv_summary)
      i += 1
    }
    i.should == 2
  end

  it "should be able to fetch the correct number of emails in a conversations" do
    @gmail.list(500, 551) { |conv_summary|
      conversation = @gmail.fetch_conversation(conv_summary)
      conversation.emails.size.should == conversation.nb_emails
    }
  end

end
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gmail-scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "0.1"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nicolas Maisonneuve
|
8
|
+
autorequire: gmail-scraper
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-04 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: mechanize
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.9.3
|
24
|
+
version:
|
25
|
+
description:
|
26
|
+
email: n.maisonneuve@gmail.com
|
27
|
+
executables: []
|
28
|
+
|
29
|
+
extensions: []
|
30
|
+
|
31
|
+
extra_rdoc_files: []
|
32
|
+
|
33
|
+
files:
|
34
|
+
- lib/conversation.rb
|
35
|
+
- lib/conv_summary.rb
|
36
|
+
- lib/email.rb
|
37
|
+
- lib/gmail.rb
|
38
|
+
- spec/basic_spec.rb
|
39
|
+
has_rdoc: true
|
40
|
+
homepage: http://github.com/nmaisonneuve/gmail-scraper-gem
|
41
|
+
licenses: []
|
42
|
+
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options: []
|
45
|
+
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: "0"
|
53
|
+
version:
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: "0"
|
59
|
+
version:
|
60
|
+
requirements: []
|
61
|
+
|
62
|
+
rubyforge_project:
|
63
|
+
rubygems_version: 1.3.5
|
64
|
+
signing_key:
|
65
|
+
specification_version: 3
|
66
|
+
summary: Scrape emails from Gmail's HTML version.
|
67
|
+
test_files: []
|
68
|
+
|