gmail-scraper 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,50 @@
1
# Raised by ConvSummary.create_from_html when the link of a conversation row
# points to a draft (its href contains "draft=").
#
# Derives from StandardError rather than Exception so that a plain `rescue`
# can handle it; `rescue DraftException` and `rescue Exception` keep working.
class DraftException < StandardError
end
3
+
4
# Summary of one conversation, built from one row of a conversation list.
class ConvSummary

  attr_accessor :uid, :subject, :nb_emails, :updated_at, :tags, :url

  # Builds a ConvSummary from the HTML node of one conversation row.
  #
  # html - node responding to #content and #search(xpath); the first ".//a"
  #        element found must expose attributes['href'].
  #
  # The row text is split on blank lines; cell 1 is read for the message
  # count, cell 3 for the labels and cell 4 for the subject.
  #
  # Returns the populated ConvSummary (updated_at is not set here).
  # Raises DraftException when the row link contains "draft=".
  def self.create_from_html(html)
    conv = new
    data = html.content.split("\n\n")

    # extract link
    link = html.search(".//a").first.attributes['href'].to_s

    # raise an exception if there is only a draft
    raise DraftException.new if link[/draft=.*/]

    conv.url = link
    conv.uid = link[/th=.*/].gsub("th=", "")

    # NOTE(review): earlier revisions subtracted drafts from this count and
    # then unconditionally overwrote the result with the raw count, so the
    # subtraction never had any effect. Only the effective behaviour is kept;
    # confirm whether drafts should really be excluded.
    conv.nb_emails = message_count(data[1])

    # extract labels (rows without a label cell get an empty list)
    conv.tags = data[3].to_s.split(",").collect { |label| label.strip.downcase }

    # extract subject
    conv.subject = data[4]
    conv
  end

  # Returns the first parenthesised number found in cell (e.g. 12 for
  # "John, me (12)"), or 1 when the cell is missing or carries no count.
  def self.message_count(cell)
    count = cell.to_s[/\((\d+)\)/, 1]
    count.nil? ? 1 : count.to_i
  end
  private_class_method :message_count
end
@@ -0,0 +1,27 @@
1
+ require 'conv_summary.rb'
2
# A conversation together with the emails added to it. Starts from the
# fields of a summary and tracks the oldest/newest email dates as emails
# are added.
class Conversation < ConvSummary

  attr_accessor :created_at, :emails

  # conv_summary - object exposing subject, nb_emails, tags, uid and url.
  def initialize(conv_summary)
    @subject = conv_summary.subject
    @nb_emails = conv_summary.nb_emails
    @tags = conv_summary.tags
    @emails = []
    @uid = conv_summary.uid
    @url = conv_summary.url
  end

  # Appends email and widens the conversation's date range:
  # created_at becomes the earliest email date seen so far, updated_at the
  # latest. The email's subject is overwritten with the conversation subject
  # when the conversation has one.
  def add_email(email)
    @emails << email
    @created_at = email.created_at if @created_at.nil? || @created_at > email.created_at
    @updated_at = email.created_at if @updated_at.nil? || @updated_at < email.created_at
    email.subject = @subject unless @subject.nil?
  end

  # Returns a new array of the emails ordered by created_at (oldest first);
  # the emails attribute itself is left in insertion order.
  #
  # Sorting is done on created_at explicitly: a bare Array#sort needs the
  # elements to implement <=> and fails with ArgumentError as soon as two
  # emails that do not implement it have to be compared.
  def sort_emails
    @emails.sort_by { |email| email.created_at }
  end
end
data/lib/email.rb ADDED
@@ -0,0 +1,56 @@
1
require 'date' # DateTime is used below

# One email of a conversation, parsed from an HTML node.
class Email
  attr_accessor :uid, :sender, :receivers, :created_at, :text, :subject

  def initialize
    @receivers = []
  end

  # Builds an Email from an HTML node.
  #
  # html - node responding to #search(xpath). The ".//tr/td" cells are read
  #        positionally: 0 sender (and links carrying the uid), 1 date,
  #        2 TO, 3 CC, 4 body (5 when cell 4 starts with "Reply |").
  #
  # Returns the Email. receivers holds two arrays, in this order: the
  # addresses found in the TO cell and those found in the CC cell. sender is
  # nil when the sender cell contains no address; uid stays nil when no link
  # of the sender cell has a "#fragment".
  # Raises ArgumentError (from DateTime.strptime) when the date cell does not
  # match '%a, %b %d, %Y at %I:%M %p'.
  def self.create_from_html(html)
    email = Email.new
    cells = html.search(".//tr/td")

    # sender
    sender = emails_to_array(cell_text(cells[0]))[0]
    email.sender = sender.downcase unless sender.nil?

    # uid: the fragment of the links in the sender cell (last one wins)
    cells[0].search(".//a").each do |link|
      fragment = link.attributes['href'].to_s[/#.*/]
      email.uid = fragment.gsub("#", "") unless fragment.nil?
    end

    # date
    email.created_at = DateTime.strptime(cell_text(cells[1]), '%a, %b %d, %Y at %I:%M %p')

    # extract TO
    email.receivers << emails_to_array(cell_text(cells[2]))

    # extract CC
    email.receivers << emails_to_array(cell_text(cells[3]))

    # extract Body
    body = cell_text(cells[4])
    body = cell_text(cells[5]) if body.index("Reply |") == 0
    email.text = body

    email
  end

  # Returns the unique email addresses found in txt, in order of appearance.
  # Public: a `private` keyword does not apply to `def self.` methods, so
  # this helper has always been callable from outside.
  def self.emails_to_array(txt)
    # {2,} rather than {2,4}: top-level domains longer than four letters
    # must not be cut off.
    txt.scan(/[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}/i).uniq
  end

  # Returns the cell's text with surrounding whitespace removed, or "" when
  # the cell is missing. The text is otherwise left untouched: removing the
  # two-character sequence "/n" (as was done before) corrupts URLs and paths.
  def self.cell_text(cell)
    cell.nil? ? "" : cell.content.strip
  end
  private_class_method :cell_text
end
56
+
data/lib/gmail.rb ADDED
@@ -0,0 +1,122 @@
1
+ require 'rubygems'
2
+ require 'mechanize'
3
+ require 'conversation.rb'
4
+ require 'email.rb'
5
+
6
+
7
# Raised by Gmail when the page of a conversation contains no email table.
#
# Derives from StandardError rather than Exception so that a plain `rescue`
# can handle it; `rescue ThreadNotFoundException` keeps working.
class ThreadNotFoundException < StandardError
end
9
+
10
# Scrapes conversation summaries and emails from the HTML version of Gmail
# through a Mechanize agent. Call #connect before #list or
# #fetch_conversation: they use the agent created there.
class Gmail

  CONV_PER_PAGE = 50
  XPATH_CONV_IN_LIST = "//tr[@bgcolor='#E8EEF7'] | //tr[@bgcolor='#ffffff']"
  XPATH_EMAIL_IN_CONV = ".//table[@bgcolor='#efefef' and @border='0']"

  # email - login submitted in the "Email" field of the login form.
  # pwd   - password submitted in the "Passwd" field.
  def initialize(email, pwd)
    @email = email
    @pwd = pwd
  end

  # Creates the agent, submits the first form of the Gmail home page with
  # the credentials and requests the HTML inbox.
  #
  # Returns true when the login appears to have succeeded, false otherwise.
  def connect
    base_url = "http://www.gmail.com"
    @agent = WWW::Mechanize.new
    base_page = @agent.get base_url
    login_form = base_page.forms.first
    login_form.Email = @email
    login_form.Passwd = @pwd
    @agent.submit(login_form)
    page = @agent.get "http://mail.google.com/mail/?ui=html&zy=a&s=a"

    # bad (but working) method to detect the connection: the page reached
    # after a failed login carries this title
    page.title.gsub("\n", "").strip != "Gmail: Email from Google"
  end

  # Scrapes the list of conversation summaries.
  #
  # conv_start - index of the first conversation to yield (0-based).
  # conv_end   - index of the last conversation to yield (inclusive), or nil
  #              to go on until the last page.
  #
  # Yields one ConvSummary per conversation; rows for which
  # ConvSummary.create_from_html raises DraftException are skipped.
  def list(conv_start=0, conv_end=nil)
    summary_as_html(conv_start, conv_end) do |html_conv|
      begin
        yield(ConvSummary.create_from_html(html_conv))
      rescue DraftException
        puts "Skiping a draft"
      end
    end
  end

  # Scrapes the emails found at the url of the given conversation summary.
  #
  # Returns a Conversation holding one Email per email table of the page.
  # Raises ThreadNotFoundException when the page contains no email.
  def fetch_conversation(conv_summary)
    conv = Conversation.new(conv_summary)
    conversation_as_html(conv.url) do |html|
      conv.add_email(Email.create_from_html(html))
    end
    conv
  end

  private

  # Walks the list pages starting at the page containing conv_start and
  # yields the HTML row of every conversation whose index lies in
  # conv_start..conv_end (no upper bound when conv_end is nil).
  #
  # The end of the listing is tracked with local control flow only: keeping
  # it in an instance variable made every listing after the first complete
  # one return immediately without yielding anything.
  def summary_as_html(conv_start, conv_end)
    page_index = conv_start / CONV_PER_PAGE

    loop do
      # get the page of threads
      url = "?s=a&st=#{page_index * CONV_PER_PAGE}"
      puts "fetching page: #{url}"
      page_list = @agent.get url

      # delegate each thread (html format)
      page_list.search(XPATH_CONV_IN_LIST).each_with_index do |conv, i|
        conv_index = page_index * CONV_PER_PAGE + i
        puts "conversation_index: #{conv_index}"

        # past the requested range: stop the whole listing
        return if !conv_end.nil? && conv_index > conv_end

        yield(conv) if conv_index >= conv_start
      end

      # is there a next page?
      next_href = "?s=a&st=#{(page_index + 1) * CONV_PER_PAGE}"
      break if page_list.search("//a[@href='#{next_href}']").size == 0
      page_index += 1
    end
  end

  # Scrapes the emails from the HTML UI of a thread/conversation.
  #
  # url - url of the thread; "&d=e" is appended before the request.
  #
  # Yields the HTML table of each email.
  # Raises ArgumentError when url is nil and ThreadNotFoundException when
  # the page contains no email table.
  def conversation_as_html(url=nil)
    raise ArgumentError, "url of the conversation is required" if url.nil?

    # request the url of the thread
    list_emails = @agent.get("#{url}&d=e").search(XPATH_EMAIL_IN_CONV)
    raise ThreadNotFoundException.new if list_emails.empty?

    list_emails.each { |email_html| yield(email_html) }
  end

end
122
+
@@ -0,0 +1,58 @@
1
+ require 'rubygems'
2
+ require "spec"
3
+ require '../lib/gmail.rb'
4
+
5
# Integration spec: every example talks to the real Gmail servers, so
# @login / @pass below have to be replaced by working credentials.
describe Gmail do

  before(:all) do
    @login = "loging"
    @pass = "password"
    @gmail = Gmail.new(@login, @pass)
    @gmail.connect
  end

  it "should not be connected with wrong login/password" do
    Gmail.new("bar", "foo").connect.should == false
  end

  it "should be connected with right login/password" do
    @gmail.connect.should == true
  end

  it "should be able to extract a summary of the first 50 conversations" do
    email_start = 0
    email_end = 1
    @gmail.connect

    yielded = 0
    @gmail.list(email_start, email_end) do |thread_summary|
      thread_summary.subject.should_not == nil
      thread_summary.nb_emails.should >= 1
      thread_summary.uid.should_not == nil
      thread_summary.tags.size.should >= 0
      yielded += 1
    end

    # both bounds are inclusive
    yielded.should == (email_end - email_start + 1)
  end

  it "should be able only to extract the correct interval of conversations" do
    fetched = 0
    @gmail.list(1, 2) do |conv_summary|
      @gmail.fetch_conversation(conv_summary)
      fetched += 1
    end
    fetched.should == 2
  end

  it "should be able to fetch the correct number of emails in a conversations" do
    @gmail.list(500, 551) do |conv_summary|
      conversation = @gmail.fetch_conversation(conv_summary)
      conversation.emails.size.should == conversation.nb_emails
    end
  end

end
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gmail-scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: "0.1"
5
+ platform: ruby
6
+ authors:
7
+ - Nicolas Maisonneuve
8
+ autorequire: gmail-scraper
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-04 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: mechanize
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.9.3
24
+ version:
25
+ description:
26
+ email: n.maisonneuve@gmail.com
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files: []
32
+
33
+ files:
34
+ - lib/conversation.rb
35
+ - lib/conv_summary.rb
36
+ - lib/email.rb
37
+ - lib/gmail.rb
38
+ - spec/basic_spec.rb
39
+ has_rdoc: true
40
+ homepage: http://github.com/nmaisonneuve/gmail-scraper-gem
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: "0"
53
+ version:
54
+ required_rubygems_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: "0"
59
+ version:
60
+ requirements: []
61
+
62
+ rubyforge_project:
63
+ rubygems_version: 1.3.5
64
+ signing_key:
65
+ specification_version: 3
66
+ summary: Scrap Gmail's emails from its HTML Version.
67
+ test_files: []
68
+