gmail-scraper 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/conv_summary.rb +50 -0
- data/lib/conversation.rb +27 -0
- data/lib/email.rb +56 -0
- data/lib/gmail.rb +122 -0
- data/spec/basic_spec.rb +58 -0
- metadata +68 -0
data/lib/conv_summary.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Raised by ConvSummary.create_from_html when the first link of a conversation
# row points to a draft (its href contains "draft=") rather than to a thread.
#
# Derives from StandardError instead of Exception so that a plain `rescue`
# handles it and it is not lumped together with interpreter-level exceptions
# such as SignalException or SystemExit. `rescue DraftException` keeps working.
class DraftException < StandardError
end
|
3
|
+
|
4
|
+
# Summary of one conversation (thread) as scraped from a single row of the
# conversation list.
class ConvSummary

  attr_accessor :uid, :subject, :nb_emails, :updated_at, :tags, :url

  # Builds a ConvSummary from the HTML node of one conversation row.
  #
  # html - an object responding to #content (the row's text) and
  #        #search(xpath) (returning nodes exposing #attributes).
  #
  # The row text is split on blank lines; field 1 is searched for a "(N)"
  # message count, field 3 is read as a comma-separated label list and field 4
  # as the subject (assumed layout of the list page - verify against live HTML).
  #
  # Returns the populated ConvSummary.
  # Raises DraftException when the row's first link contains "draft=".
  # Raises ArgumentError when the link carries no "th=" parameter.
  def self.create_from_html(html)
    conv = new
    fields = html.content.split("\n\n")

    # extract link
    link = html.search(".//a").first.attributes['href'].to_s

    # raise an exception if there is only a draft
    raise DraftException if link[/draft=.*/]

    conv.url = link

    # Stop at the next query separator or fragment so that parameters following
    # "th" do not leak into the uid.
    uid = link[/th=([^&#]*)/, 1]
    raise ArgumentError, "no th= parameter in conversation link: #{link}" if uid.nil?
    conv.uid = uid

    # extract nb_emails: first "(N)" group of field 1, defaulting to 1.
    # NOTE(review): drafts are NOT subtracted here. The previous draft
    # adjustment was immediately overwritten by a second assignment of
    # nb_emails, so it never had any effect and was removed; confirm whether
    # drafts are supposed to be excluded from the count.
    count = fields[1][/\((\d+)\)/, 1]
    conv.nb_emails = count.nil? ? 1 : count.to_i

    # extract labels
    conv.tags = fields[3].split(",").map { |label| label.strip.downcase }

    # extract subject
    conv.subject = fields[4]

    conv
  end
end
|
data/lib/conversation.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'conv_summary.rb'
|
2
|
+
# A full conversation: the data of a ConvSummary plus the emails it contains.
class Conversation < ConvSummary

  attr_accessor :created_at, :emails

  # Copies subject, nb_emails, tags, uid and url from the given summary and
  # starts with an empty list of emails.
  #
  # conv_summary - object responding to #subject, #nb_emails, #tags, #uid, #url.
  def initialize(conv_summary)
    @subject = conv_summary.subject
    @nb_emails = conv_summary.nb_emails
    @tags = conv_summary.tags
    @emails = []
    @uid = conv_summary.uid
    @url = conv_summary.url
  end

  # Appends an email and keeps the conversation's time span up to date:
  # created_at becomes the earliest email date seen, updated_at the latest.
  # The email inherits the conversation subject when one is set.
  #
  # email - object responding to #created_at and #subject=.
  def add_email(email)
    @emails << email
    @created_at = email.created_at if @created_at.nil? || @created_at > email.created_at
    @updated_at = email.created_at if @updated_at.nil? || @updated_at < email.created_at
    email.subject = @subject unless @subject.nil?
  end

  # Returns a new array with the emails ordered by creation date (oldest
  # first). The stored list is left untouched.
  #
  # Sorting goes through created_at because the emails themselves define no
  # <=>, so a plain Array#sort raises as soon as there are two of them.
  def sort_emails
    @emails.sort_by { |email| email.created_at }
  end
end
|
data/lib/email.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# DateTime is used below; load it explicitly instead of relying on another
# library having required it first.
require 'date'

# One email scraped from the HTML of a conversation page.
class Email
  attr_accessor :uid, :sender, :receivers, :created_at, :text, :subject

  def initialize
    @receivers = []
  end

  # Builds an Email from the HTML node of one message.
  #
  # html - object whose #search(".//tr/td") returns the message's cells; each
  #        cell responds to #content and #search. Cells are read by position:
  #        0 sender (and the links carrying the uid), 1 date, 2 "To", 3 "Cc",
  #        4 body (5 when cell 4 starts with "Reply |").
  #
  # Returns the populated Email. receivers ends up as a two-element array:
  # [to_addresses, cc_addresses].
  # Raises ArgumentError (from DateTime.strptime) when the date cell does not
  # match '%a, %b %d, %Y at %I:%M %p'.
  def self.create_from_html(html)
    email = Email.new
    cells = html.search(".//tr/td")

    # sender: first address found in the first cell
    email.sender = emails_to_array(cells[0].content)[0].downcase

    # uid: fragment of the links in the first cell (the last one wins);
    # links without a fragment are skipped instead of crashing.
    cells[0].search(".//a").each do |anchor|
      fragment = anchor.attributes['href'].to_s[/#.*/]
      email.uid = fragment.delete("#") unless fragment.nil?
    end

    # date
    email.created_at = DateTime.strptime(cells[1].content.strip, '%a, %b %d, %Y at %I:%M %p')

    # extract TO
    email.receivers << emails_to_array(cells[2].content)

    # extract CC
    email.receivers << emails_to_array(cells[3].content)

    # extract Body. Only surrounding whitespace is trimmed: the text used to
    # be passed through gsub("/n", ""), which deleted the two literal
    # characters "/n" (mangling e.g. ".../news") and never touched newlines.
    body = cells[4].nil? ? "" : cells[4].content
    body = cells[5].content if body.strip.start_with?("Reply |")
    email.text = body.strip

    email
  end

  # NOTE: `private` does not apply to methods defined with `def self.`, so
  # emails_to_array remains publicly callable; it is kept that way so existing
  # callers are not broken.
  private

  # Returns the unique email addresses found in txt, in order of appearance.
  # The top-level domain may be 2 to 63 letters long so that addresses under
  # long TLDs are not truncated.
  def self.emails_to_array(txt)
    txt.scan(/[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,63}/i).uniq
  end

end
|
56
|
+
|
data/lib/gmail.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'conversation.rb'
|
4
|
+
require 'email.rb'
|
5
|
+
|
6
|
+
|
7
|
+
# Raised by Gmail#conversation_as_html when the page fetched for a
# conversation contains no email table.
#
# Derives from StandardError instead of Exception so that a plain `rescue`
# handles it and it is not lumped together with interpreter-level exceptions
# such as SignalException or SystemExit.
class ThreadNotFoundException < StandardError
end
|
9
|
+
|
10
|
+
# Scrapes conversations and emails from the HTML version of Gmail through a
# Mechanize agent.
class Gmail

  CONV_PER_PAGE=50
  XPATH_CONV_IN_LIST="//tr[@bgcolor='#E8EEF7'] | //tr[@bgcolor='#ffffff']"
  XPATH_EMAIL_IN_CONV=".//table[@bgcolor='#efefef' and @border='0']"

  # email - account login
  # pwd   - account password
  def initialize(email, pwd)
    @email = email
    @pwd = pwd
    # Despite its name this flag only means "stop paging"; see summary_as_html.
    @error = false
  end

  # Logs in by submitting the first form of the Gmail home page, then loads
  # the HTML mailbox.
  #
  # Returns true when the page obtained after login is not the public
  # "Gmail: Email from Google" page, false otherwise.
  def connect
    base_url = "http://www.gmail.com"
    # Older Mechanize releases expose the class as WWW::Mechanize, newer ones
    # only as Mechanize; use whichever is loaded.
    agent_class = defined?(WWW::Mechanize) ? WWW::Mechanize : Mechanize
    @agent = agent_class.new
    base_page = @agent.get base_url
    login_form = base_page.forms.first
    login_form.Email = @email
    login_form.Passwd = @pwd
    @agent.submit(login_form)
    page = @agent.get "http://mail.google.com/mail/?ui=html&zy=a&s=a"

    # bad(but working) method to detect the connection
    !(page.title.gsub("\n", "").strip == "Gmail: Email from Google")
  end

  # Scrapes the list of conversation summaries.
  #
  # conv_start - index of the first conversation to yield (0-based)
  # conv_end   - index of the last conversation to yield (inclusive), or nil
  #              to continue until the last page
  #
  # Yields one ConvSummary per conversation. Rows for which a DraftException
  # is raised are reported on stdout and skipped.
  def list(conv_start=0, conv_end=nil)
    summary_as_html(conv_start, conv_end) do |html_conv|
      begin
        yield(ConvSummary.create_from_html(html_conv))
      rescue DraftException
        puts "Skiping a draft"
      end
    end
  end

  # Scrapes the emails found at the url of the given conversation summary.
  #
  # Returns a Conversation holding one Email per message.
  # Raises ThreadNotFoundException when the page contains no email.
  def fetch_conversation(conv_summary)
    conv = Conversation.new(conv_summary)
    conversation_as_html(conv.url) do |html|
      conv.add_email(Email.create_from_html(html))
    end
    conv
  end

  private

  # Walks the list pages and yields the HTML row of every conversation whose
  # index lies between conv_start and conv_end (inclusive; nil = no upper
  # bound).
  def summary_as_html(conv_start, conv_end)
    # Start every listing with a clean flag: it is set once the last page has
    # been reached, and without this reset any later call would yield nothing.
    @error = false

    page_index = conv_start / CONV_PER_PAGE

    until @error
      # get the page of threads
      url = "?s=a&st=#{page_index*CONV_PER_PAGE}"
      puts "fetching page: #{url}"
      page_list = @agent.get url

      # delegate each thread (html format)
      page_list.search(XPATH_CONV_IN_LIST).each_with_index do |conv, i|
        conv_index = page_index * CONV_PER_PAGE + i
        puts "conversation_index: #{conv_index}"

        # `return` leaves summary_as_html itself, ending the whole listing.
        return if (!conv_end.nil? && conv_index > conv_end) || @error

        yield(conv) if conv_index >= conv_start
      end

      # is there a next page?
      if page_list.search("//a[@href='?s=a&st=#{((page_index+1)*CONV_PER_PAGE)}']").size > 0
        page_index += 1
      else
        @error = true
      end
    end
  end

  # Scrapes the emails from the HTML UI of a thread/conversation.
  #
  # url - url of the conversation; "&d=e" is appended before the request
  #
  # Yields each HTML table matching XPATH_EMAIL_IN_CONV.
  # Raises ArgumentError when url is nil.
  # Raises ThreadNotFoundException when no such table is found.
  def conversation_as_html(url=nil)
    raise ArgumentError, "url is required" if url.nil?

    # request the url of the thread
    list_emails = @agent.get(url + "&d=e").search(XPATH_EMAIL_IN_CONV)
    raise ThreadNotFoundException.new if list_emails.empty?

    list_emails.each { |email_html| yield(email_html) }
  end

end
|
122
|
+
|
data/spec/basic_spec.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require "spec"
|
3
|
+
require '../lib/gmail.rb'
|
4
|
+
|
5
|
+
# Examples exercising Gmail against the live service (they need network
# access; the credentials below look like placeholders - replace before use).
describe Gmail do

  # Create one shared client and log in before any example runs.
  before(:all) do
    @login = "loging"
    @pass = "password"
    @gmail = Gmail.new(@login, @pass)
    @gmail.connect
  end

  it "should not be connected with wrong login/password" do
    other = Gmail.new("bar", "foo")
    other.connect.should == false
  end

  it "should be connected with right login/password" do
    @gmail.connect.should == true
  end

  it "should be able to extract a summary of the first 50 conversations" do
    first_index = 0
    last_index = 1
    @gmail.connect

    # Every yielded summary must be filled in; count how many were yielded.
    seen = 0
    @gmail.list(first_index, last_index) do |thread_summary|
      thread_summary.subject.should_not == nil
      thread_summary.nb_emails.should >= 1
      thread_summary.uid.should_not == nil
      thread_summary.tags.size.should >= 0
      seen += 1
    end

    # Both bounds are inclusive.
    seen.should == (last_index - first_index + 1)
  end

  it "should be able only to extract the correct interval of conversations" do
    fetched = 0
    @gmail.list(1, 2) do |conv_summary|
      @gmail.fetch_conversation(conv_summary)
      fetched += 1
    end
    fetched.should == 2
  end

  it "should be able to fetch the correct number of emails in a conversations" do
    @gmail.list(500, 551) do |conv_summary|
      conversation = @gmail.fetch_conversation(conv_summary)
      conversation.emails.size.should == conversation.nb_emails
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gmail-scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "0.1"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nicolas Maisonneuve
|
8
|
+
autorequire: gmail-scraper
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-04 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: mechanize
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.9.3
|
24
|
+
version:
|
25
|
+
description:
|
26
|
+
email: n.maisonneuve@gmail.com
|
27
|
+
executables: []
|
28
|
+
|
29
|
+
extensions: []
|
30
|
+
|
31
|
+
extra_rdoc_files: []
|
32
|
+
|
33
|
+
files:
|
34
|
+
- lib/conversation.rb
|
35
|
+
- lib/conv_summary.rb
|
36
|
+
- lib/email.rb
|
37
|
+
- lib/gmail.rb
|
38
|
+
- spec/basic_spec.rb
|
39
|
+
has_rdoc: true
|
40
|
+
homepage: http://github.com/nmaisonneuve/gmail-scraper-gem
|
41
|
+
licenses: []
|
42
|
+
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options: []
|
45
|
+
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: "0"
|
53
|
+
version:
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: "0"
|
59
|
+
version:
|
60
|
+
requirements: []
|
61
|
+
|
62
|
+
rubyforge_project:
|
63
|
+
rubygems_version: 1.3.5
|
64
|
+
signing_key:
|
65
|
+
specification_version: 3
|
66
|
+
summary: Scrap Gmail's emails from its HTML Version.
|
67
|
+
test_files: []
|
68
|
+
|