gmail-scraper 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,50 @@
1
# Raised by ConvSummary.create_from_html when the scraped row links to a draft
# instead of a sent/received conversation.
#
# Derives from StandardError (not Exception) so that it takes part in ordinary
# error handling; `rescue DraftException` keeps working unchanged.
class DraftException < StandardError
end
3
+
4
# Summary of one conversation, built from a single row of the conversation
# list page.
class ConvSummary

  attr_accessor :uid, :subject, :nb_emails, :updated_at, :tags, :url

  # Builds a ConvSummary from one conversation row.
  #
  # html - a node responding to #content (the row's text) and #search(xpath)
  #        (returning nodes whose #attributes['href'] is the link target).
  #
  # The row text is split on blank lines; column 1 is read as the
  # participants (optionally followed by "(N)", the number of emails),
  # column 3 as the comma-separated labels and column 4 as the subject.
  #
  # Returns the populated ConvSummary.
  # Raises DraftException when the row's first link contains "draft=".
  def self.create_from_html(html)
    conv = new
    columns = html.content.split("\n\n")

    # extract link
    link = html.search(".//a").first.attributes['href'].to_s

    # raise an exception if there is only a draft
    raise DraftException.new if link[/draft=.*/]

    conv.url = link
    # Stop at the next query parameter / fragment so that a link such as
    # "?th=abc&v=c" yields "abc" rather than "abc&v=c". nil when the link
    # carries no "th=" parameter.
    conv.uid = link[/th=([^&#]*)/, 1]

    # extract nb_emails: the first "(N)" of the participants column, or 1
    # when no count is shown. The count is taken verbatim: drafts are NOT
    # subtracted (a previous subtraction step was overwritten before the
    # method returned, so it never influenced the result).
    count = columns[1].to_s[/\((\d+)\)/, 1]
    conv.nb_emails = count.nil? ? 1 : count.to_i

    # extract labels (empty list when the column is missing)
    conv.tags = columns[3].to_s.split(",").map { |label| label.strip.downcase }

    # extract subject
    conv.subject = columns[4]

    conv
  end
end
@@ -0,0 +1,27 @@
1
+ require 'conv_summary.rb'
2
# A full conversation: the data of a ConvSummary plus the emails it contains.
class Conversation<ConvSummary

  attr_accessor :created_at, :emails

  # Copies subject, nb_emails, tags, uid and url from conv_summary and starts
  # with an empty list of emails.
  def initialize(conv_summary)
    @subject = conv_summary.subject
    @nb_emails = conv_summary.nb_emails
    @tags = conv_summary.tags
    @emails = []
    @uid = conv_summary.uid
    @url = conv_summary.url
  end

  # Appends email to the conversation.
  #
  # created_at / updated_at are widened so that they track the earliest and
  # latest email.created_at seen so far, and the email receives the
  # conversation's subject when the conversation has one.
  def add_email(email)
    @emails << email
    stamp = email.created_at
    @created_at = stamp if @created_at.nil? || @created_at > stamp
    @updated_at = stamp if @updated_at.nil? || @updated_at < stamp
    email.subject = @subject unless @subject.nil?
  end

  # Sorts the emails chronologically (by created_at), in place, and returns
  # the sorted array.
  #
  # A plain `@emails.sort` would discard its result and relies on the
  # elements implementing <=>; sorting by created_at avoids both problems.
  def sort_emails
    @emails.replace(@emails.sort_by { |email| email.created_at })
  end
end
data/lib/email.rb ADDED
@@ -0,0 +1,56 @@
1
# One email of a conversation, scraped from the conversation page.
class Email
  attr_accessor :uid, :sender, :receivers, :created_at, :text, :subject

  # Pattern used to pull addresses out of free text. The top-level domain is
  # not capped in length, so addresses such as "x@y.museum" are kept whole.
  EMAIL_ADDRESS = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}/i

  def initialize
    @receivers = []
  end

  # Builds an Email from the HTML table describing one message.
  #
  # html - a node responding to #search(xpath); the result of
  #        search(".//tr/td") is read positionally:
  #        0 sender (its links carry the uid after "#"), 1 date, 2 To, 3 Cc,
  #        4 body, or 5 when cell 4 starts with "Reply |".
  #
  # After the call:
  #   sender     - first address of cell 0, lower-cased (nil if none found)
  #   uid        - text after "#" of the last link in cell 0 that has one
  #   created_at - DateTime parsed with '%a, %b %d, %Y at %I:%M %p'
  #   receivers  - two arrays: [to_addresses, cc_addresses]
  #   text       - body with surrounding whitespace removed
  #
  # Raises ArgumentError (from DateTime.strptime) when the date cell does not
  # match the expected format.
  def self.create_from_html(html)
    email = Email.new
    cells = html.search(".//tr/td")

    # sender
    senders = emails_to_array(cell_text(cells[0]))
    email.sender = senders[0].downcase unless senders.empty?

    # uid
    unless cells[0].nil?
      cells[0].search(".//a").each do |anchor|
        fragment = anchor.attributes['href'].to_s[/#.*/]
        email.uid = fragment.delete("#") unless fragment.nil?
      end
    end

    # date
    email.created_at = DateTime.strptime(cell_text(cells[1]), '%a, %b %d, %Y at %I:%M %p')

    # extract TO, then CC; each list is appended as its own array
    email.receivers << emails_to_array(cell_text(cells[2]))
    email.receivers << emails_to_array(cell_text(cells[3]))

    # extract Body: when cell 4 starts with "Reply |" the body is in the
    # next cell instead
    body = cells[4].nil? ? "" : cells[4].content
    if body.strip.index("Reply |") == 0
      body = cells[5].nil? ? "" : cells[5].content
    end
    # Only surrounding whitespace is trimmed; the text itself is left
    # untouched so that sequences such as "/n" inside URLs survive.
    email.text = body.strip

    return email
  end

  # Returns the distinct email addresses found in txt, in order of
  # appearance. Remains publicly callable: a bare `private` does not apply to
  # methods defined with `def self.`.
  def self.emails_to_array(txt)
    txt.scan(EMAIL_ADDRESS).uniq
  end

  # Text of a table cell without surrounding whitespace; "" for a missing cell.
  def self.cell_text(cell)
    cell.nil? ? "" : cell.content.strip
  end

end
56
+
data/lib/gmail.rb ADDED
@@ -0,0 +1,122 @@
1
+ require 'rubygems'
2
+ require 'mechanize'
3
+ require 'conversation.rb'
4
+ require 'email.rb'
5
+
6
+
7
# Raised when a conversation page contains no email at all.
#
# Derives from StandardError (not Exception) so that it takes part in ordinary
# error handling; `rescue ThreadNotFoundException` keeps working unchanged.
class ThreadNotFoundException < StandardError
end
9
+
10
# Scrapes conversations and emails from the HTML version of Gmail.
class Gmail

  # Number of conversations assumed per list page when computing offsets.
  CONV_PER_PAGE=50
  XPATH_CONV_IN_LIST="//tr[@bgcolor='#E8EEF7'] | //tr[@bgcolor='#ffffff']"
  XPATH_EMAIL_IN_CONV=".//table[@bgcolor='#efefef' and @border='0']"

  # email - account login
  # pwd   - account password
  def initialize(email, pwd)
    @email = email
    @pwd = pwd
  end

  # Logs in by filling the first form of the start page, then opens the
  # HTML mailbox.
  #
  # Returns true when the resulting page title differs from the login page
  # title "Gmail: Email from Google", false otherwise.
  #
  # NOTE(review): both URLs use plain http; consider https if the service
  # accepts it, since credentials and session cookies are involved.
  def connect
    base_url = "http://www.gmail.com"
    @agent = mechanize_class.new
    base_page = @agent.get base_url
    login_form = base_page.forms.first
    login_form.Email = @email
    login_form.Passwd = @pwd
    @agent.submit(login_form)
    page = @agent.get "http://mail.google.com/mail/?ui=html&zy=a&s=a"

    # bad(but working) method to detect the connection
    page.title.gsub("\n", "").strip != "Gmail: Email from Google"
  end

  # Scrapes the list of conversation summaries.
  #
  # conv_start - index of the first conversation to yield (default 0)
  # conv_end   - index of the last conversation to yield, inclusive
  #              (nil: until the last page)
  #
  # Yields one ConvSummary per conversation. Rows that are drafts are
  # skipped (a message is printed). Only the parsing step is guarded, so an
  # exception raised by the caller's block is never swallowed here.
  def list(conv_start=0, conv_end=nil)
    summary_as_html(conv_start, conv_end) do |html_conv|
      begin
        summary = ConvSummary.create_from_html(html_conv)
      rescue DraftException
        puts "Skiping a draft"
        next
      end
      yield(summary)
    end
  end

  # Scrapes the emails of the conversation described by conv_summary.
  #
  # Returns a Conversation holding one Email per message found.
  # Raises ThreadNotFoundException when the page contains no email.
  def fetch_conversation(conv_summary)
    conv = Conversation.new(conv_summary)
    conversation_as_html(conv.url) do |html|
      conv.add_email(Email.create_from_html(html))
    end
    conv
  end

  private

  # Class used to build the HTTP agent: the top-level Mechanize when the
  # installed gem provides it, otherwise the older WWW::Mechanize name.
  def mechanize_class
    defined?(::Mechanize) ? ::Mechanize : WWW::Mechanize
  end

  # Walks the list pages and yields the HTML row of every conversation whose
  # index lies in conv_start..conv_end (conv_end nil: no upper bound).
  #
  # Pagination state is local to each call, so the method can be invoked any
  # number of times on the same instance.
  def summary_as_html(conv_start, conv_end)
    page_index = conv_start / CONV_PER_PAGE

    loop do
      # get the page of threads
      url = "?s=a&st=#{page_index*CONV_PER_PAGE}"
      puts "fetching page: #{url}"
      page_list = @agent.get url

      # delegate each thread (html format)
      page_list.search(XPATH_CONV_IN_LIST).each_with_index do |conv, i|
        conv_index = page_index*CONV_PER_PAGE + i
        puts "conversation_index: #{conv_index}"

        return if !conv_end.nil? && conv_index > conv_end

        yield(conv) if conv_index >= conv_start
      end

      next_offset = (page_index+1)*CONV_PER_PAGE

      # nothing on the next page can be in range: skip the extra request
      return if !conv_end.nil? && next_offset > conv_end

      # is there a next page?
      return if page_list.search("//a[@href='?s=a&st=#{next_offset}']").size == 0

      page_index += 1
    end
  end

  # Scrapes emails from the HTML UI of a thread/conversation.
  #
  # url - url of the conversation ("&d=e" is appended before the request)
  #
  # Yields the HTML table of each email.
  # Raises ArgumentError when url is nil and ThreadNotFoundException when the
  # page contains no email table.
  def conversation_as_html(url=nil)
    raise ArgumentError, "a conversation url is required" if url.nil?

    # request the url of the thread
    list_emails = @agent.get(url + "&d=e").search(XPATH_EMAIL_IN_CONV)
    raise ThreadNotFoundException.new if list_emails.empty?

    list_emails.each { |email_html| yield(email_html) }
  end

end
122
+
@@ -0,0 +1,58 @@
1
+ require 'rubygems'
2
+ require "spec"
3
+ require '../lib/gmail.rb'
4
+
5
# Integration examples for Gmail. They talk to the live service, so @login and
# @pass must be replaced with real credentials before running.
describe Gmail do

  before(:all) do
    @login = "loging"
    @pass = "password"
    @gmail = Gmail.new(@login, @pass)
    @gmail.connect
  end

  it "should not be connected with wrong login/password" do
    Gmail.new("bar", "foo").connect.should == false
  end

  it "should be connected with right login/password" do
    @gmail.connect.should == true
  end

  it "should be able to extract a summary of the first 50 conversations" do
    first_index = 0
    last_index = 1
    @gmail.connect
    seen = 0
    @gmail.list(first_index, last_index) do |summary|
      summary.subject.should_not == nil
      summary.nb_emails.should >= 1
      summary.uid.should_not == nil
      summary.tags.size.should >= 0

      seen += 1
    end
    # both bounds are inclusive
    seen.should == (last_index - first_index + 1)
  end

  it "should be able only to extract the correct interval of conversations" do
    seen = 0
    @gmail.list(1, 2) do |conv_summary|
      @gmail.fetch_conversation(conv_summary)
      seen += 1
    end
    seen.should == 2
  end

  it "should be able to fetch the correct number of emails in a conversations" do
    @gmail.list(500, 551) do |conv_summary|
      fetched = @gmail.fetch_conversation(conv_summary)
      fetched.emails.size.should == fetched.nb_emails
    end
  end

end
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gmail-scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: "0.1"
5
+ platform: ruby
6
+ authors:
7
+ - Nicolas Maisonneuve
8
+ autorequire: gmail-scraper
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-04 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: mechanize
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.9.3
24
+ version:
25
+ description:
26
+ email: n.maisonneuve@gmail.com
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files: []
32
+
33
+ files:
34
+ - lib/conversation.rb
35
+ - lib/conv_summary.rb
36
+ - lib/email.rb
37
+ - lib/gmail.rb
38
+ - spec/basic_spec.rb
39
+ has_rdoc: true
40
+ homepage: http://github.com/nmaisonneuve/gmail-scraper-gem
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: "0"
53
+ version:
54
+ required_rubygems_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: "0"
59
+ version:
60
+ requirements: []
61
+
62
+ rubyforge_project:
63
+ rubygems_version: 1.3.5
64
+ signing_key:
65
+ specification_version: 3
66
+ summary: Scrap Gmail's emails from its HTML Version.
67
+ test_files: []
68
+