gmail-scraper 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/conversation.rb +1 -1
- data/lib/gmail.rb +23 -16
- data/spec/basic_spec.rb +5 -12
- metadata +3 -3
data/lib/conversation.rb
CHANGED
data/lib/gmail.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'mechanize'
|
3
|
-
require 'conversation.rb'
|
4
|
-
require 'email.rb'
|
3
|
+
require 'lib/conversation.rb'
|
4
|
+
require 'lib/email.rb'
|
5
5
|
|
6
6
|
|
7
7
|
class ThreadNotFoundException<Exception
|
@@ -13,7 +13,6 @@ class Gmail
|
|
13
13
|
XPATH_CONV_IN_LIST="//tr[@bgcolor='#E8EEF7'] | //tr[@bgcolor='#ffffff']"
|
14
14
|
XPATH_EMAIL_IN_CONV=".//table[@bgcolor='#efefef' and @border='0']"
|
15
15
|
|
16
|
-
|
17
16
|
|
18
17
|
def initialize(email, pwd)
|
19
18
|
@email=email
|
@@ -24,30 +23,32 @@ class Gmail
|
|
24
23
|
def connect
|
25
24
|
base_url="http://www.gmail.com"
|
26
25
|
@agent = WWW::Mechanize.new
|
26
|
+
@agent.follow_meta_refresh = true
|
27
27
|
base_page = @agent.get base_url
|
28
28
|
login_form = base_page.forms.first
|
29
29
|
login_form.Email = @email
|
30
30
|
login_form.Passwd = @pwd
|
31
31
|
@agent.submit(login_form)
|
32
|
-
page= @agent.get "http://mail.google.com/mail/?ui=html&zy=a
|
32
|
+
page= @agent.get "http://mail.google.com/mail/?ui=html&zy=a"
|
33
33
|
|
34
34
|
# bad (but working) method to detect the connection
|
35
|
-
connected=!(page.title.
|
35
|
+
connected=!(page.title[/Inbox/].nil?)
|
36
36
|
|
37
37
|
return connected
|
38
38
|
end
|
39
39
|
|
40
40
|
# Scrape the list of conversation summaries
|
41
41
|
|
42
|
+
def search(tag, conv_start=0, conv_end=nil)
|
43
|
+
summary_as_html("s=l&l=#{tag}", conv_start, conv_end){|html_conv, i|
|
44
|
+
yield(html_conv, i)
|
45
|
+
}
|
46
|
+
end
|
47
|
+
|
42
48
|
def list(conv_start=0, conv_end=nil)
|
43
49
|
# display the list of emails
|
44
|
-
summary_as_html(conv_start, conv_end){|html_conv, i|
|
45
|
-
|
46
|
-
c=ConvSummary.create_from_html(html_conv)
|
47
|
-
yield(c)
|
48
|
-
rescue DraftException
|
49
|
-
puts "Skiping a draft"
|
50
|
-
end
|
50
|
+
summary_as_html("s=a", conv_start, conv_end){|html_conv, i|
|
51
|
+
|
51
52
|
|
52
53
|
}
|
53
54
|
end
|
@@ -64,14 +65,16 @@ class Gmail
|
|
64
65
|
|
65
66
|
private
|
66
67
|
|
67
|
-
|
68
|
+
|
69
|
+
def summary_as_html(mode, conv_start, conv_end)
|
68
70
|
|
69
71
|
page_index=conv_start/CONV_PER_PAGE
|
70
72
|
|
71
73
|
while (!@error)
|
72
74
|
|
73
75
|
# get the page of threads
|
74
|
-
url="
|
76
|
+
url="?#{mode}&st=#{page_index*CONV_PER_PAGE}"
|
77
|
+
|
75
78
|
puts "fetching page: #{url}"
|
76
79
|
page_list=@agent.get url
|
77
80
|
|
@@ -79,14 +82,18 @@ class Gmail
|
|
79
82
|
|
80
83
|
page_list.search(XPATH_CONV_IN_LIST).each_with_index{|conv, i|
|
81
84
|
conv_index=page_index*CONV_PER_PAGE+i
|
82
|
-
puts "conversation_index: #{conv_index}"
|
83
85
|
|
84
86
|
if (((!conv_end.nil?) && (conv_index>conv_end)) || @error)
|
85
87
|
return
|
86
88
|
end
|
87
89
|
|
88
90
|
if (conv_index>=conv_start)
|
89
|
-
|
91
|
+
begin
|
92
|
+
c=ConvSummary.create_from_html(conv)
|
93
|
+
yield(c)
|
94
|
+
rescue DraftException
|
95
|
+
puts "Skiping a draft"
|
96
|
+
end
|
90
97
|
end
|
91
98
|
}
|
92
99
|
|
data/spec/basic_spec.rb
CHANGED
@@ -24,11 +24,11 @@ describe Gmail do
|
|
24
24
|
|
25
25
|
|
26
26
|
it "should be able to extract a summary of the first 50 conversations" do
|
27
|
-
|
28
|
-
|
27
|
+
conv_start=0
|
28
|
+
conv_end=49
|
29
29
|
@gmail.connect
|
30
30
|
i=0
|
31
|
-
@gmail.list(
|
31
|
+
@gmail.list(conv_start, conv_end) { |thread_summary|
|
32
32
|
thread_summary.subject.should_not==nil
|
33
33
|
thread_summary.nb_emails.should>=1
|
34
34
|
thread_summary.uid.should_not==nil
|
@@ -36,17 +36,10 @@ describe Gmail do
|
|
36
36
|
|
37
37
|
i+=1
|
38
38
|
}
|
39
|
-
i.should==(
|
39
|
+
i.should==(conv_end-conv_start+1)
|
40
40
|
end
|
41
41
|
|
42
|
-
|
43
|
-
i=0
|
44
|
-
@gmail.list(1,2){ |conv_summary|
|
45
|
-
conv=@gmail.fetch_conversation(conv_summary)
|
46
|
-
i+=1
|
47
|
-
}
|
48
|
-
i.should==2
|
49
|
-
end
|
42
|
+
|
50
43
|
|
51
44
|
it "should be able to fetch the correct number of emails in a conversations" do
|
52
45
|
@gmail.list(500,551){ |conv_summary|
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gmail-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: "0.
|
4
|
+
version: "0.2"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nicolas Maisonneuve
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-02-26 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|