gmail-scraper 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- require 'conv_summary.rb'
1
+ require 'lib/conv_summary.rb'
2
2
  class Conversation<ConvSummary
3
3
 
4
4
  attr_accessor :created_at, :emails
@@ -1,7 +1,7 @@
1
1
  require 'rubygems'
2
2
  require 'mechanize'
3
- require 'conversation.rb'
4
- require 'email.rb'
3
+ require 'lib/conversation.rb'
4
+ require 'lib/email.rb'
5
5
 
6
6
 
7
7
  class ThreadNotFoundException<Exception
@@ -13,7 +13,6 @@ class Gmail
13
13
  XPATH_CONV_IN_LIST="//tr[@bgcolor='#E8EEF7'] | //tr[@bgcolor='#ffffff']"
14
14
  XPATH_EMAIL_IN_CONV=".//table[@bgcolor='#efefef' and @border='0']"
15
15
 
16
-
17
16
 
18
17
  def initialize(email, pwd)
19
18
  @email=email
@@ -24,30 +23,32 @@ class Gmail
24
23
  def connect
25
24
  base_url="http://www.gmail.com"
26
25
  @agent = WWW::Mechanize.new
26
+ @agent.follow_meta_refresh = true
27
27
  base_page = @agent.get base_url
28
28
  login_form = base_page.forms.first
29
29
  login_form.Email = @email
30
30
  login_form.Passwd = @pwd
31
31
  @agent.submit(login_form)
32
- page= @agent.get "http://mail.google.com/mail/?ui=html&zy=a&s=a"
32
+ page= @agent.get "http://mail.google.com/mail/?ui=html&zy=a"
33
33
 
34
34
  # bad(but working) method to detect the connection
35
- connected=!(page.title.gsub("\n", "").strip=="Gmail: Email from Google")
35
+ connected=!(page.title[/Inbox/].nil?)
36
36
 
37
37
  return connected
38
38
  end
39
39
 
40
40
  # Scrap the list of summaries of conversations
41
41
 
42
+ def search(tag, conv_start=0, conv_end=nil)
43
+ summary_as_html("s=l&l=#{tag}", conv_start, conv_end){|html_conv, i|
44
+ yield(html_conv, i)
45
+ }
46
+ end
47
+
42
48
  def list(conv_start=0, conv_end=nil)
43
49
  # display the list of emails
44
- summary_as_html(conv_start, conv_end){|html_conv, i|
45
- begin
46
- c=ConvSummary.create_from_html(html_conv)
47
- yield(c)
48
- rescue DraftException
49
- puts "Skiping a draft"
50
- end
50
+ summary_as_html("s=a", conv_start, conv_end){|html_conv, i|
51
+
51
52
 
52
53
  }
53
54
  end
@@ -64,14 +65,16 @@ class Gmail
64
65
 
65
66
  private
66
67
 
67
- def summary_as_html(conv_start, conv_end)
68
+
69
+ def summary_as_html(mode, conv_start, conv_end)
68
70
 
69
71
  page_index=conv_start/CONV_PER_PAGE
70
72
 
71
73
  while (!@error)
72
74
 
73
75
  # get the page of threads
74
- url="?s=a&st=#{page_index*CONV_PER_PAGE}"
76
+ url="?#{mode}&st=#{page_index*CONV_PER_PAGE}"
77
+
75
78
  puts "fetching page: #{url}"
76
79
  page_list=@agent.get url
77
80
 
@@ -79,14 +82,18 @@ class Gmail
79
82
 
80
83
  page_list.search(XPATH_CONV_IN_LIST).each_with_index{|conv, i|
81
84
  conv_index=page_index*CONV_PER_PAGE+i
82
- puts "conversation_index: #{conv_index}"
83
85
 
84
86
  if (((!conv_end.nil?) && (conv_index>conv_end)) || @error)
85
87
  return
86
88
  end
87
89
 
88
90
  if (conv_index>=conv_start)
89
- yield(conv)
91
+ begin
92
+ c=ConvSummary.create_from_html(conv)
93
+ yield(c)
94
+ rescue DraftException
95
+ puts "Skiping a draft"
96
+ end
90
97
  end
91
98
  }
92
99
 
@@ -24,11 +24,11 @@ describe Gmail do
24
24
 
25
25
 
26
26
  it "should be able to extract a summary of the first 50 conversations" do
27
- email_start=0
28
- email_end=1
27
+ conv_start=0
28
+ conv_end=49
29
29
  @gmail.connect
30
30
  i=0
31
- @gmail.list(email_start, email_end) { |thread_summary|
31
+ @gmail.list(conv_start, conv_end) { |thread_summary|
32
32
  thread_summary.subject.should_not==nil
33
33
  thread_summary.nb_emails.should>=1
34
34
  thread_summary.uid.should_not==nil
@@ -36,17 +36,10 @@ describe Gmail do
36
36
 
37
37
  i+=1
38
38
  }
39
- i.should==(email_end-email_start+1)
39
+ i.should==(conv_end-conv_start+1)
40
40
  end
41
41
 
42
- it "should be able only to extract the correct interval of conversations" do
43
- i=0
44
- @gmail.list(1,2){ |conv_summary|
45
- conv=@gmail.fetch_conversation(conv_summary)
46
- i+=1
47
- }
48
- i.should==2
49
- end
42
+
50
43
 
51
44
  it "should be able to fetch the correct number of emails in a conversations" do
52
45
  @gmail.list(500,551){ |conv_summary|
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gmail-scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.1"
4
+ version: "0.2"
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nicolas Maisonneuve
8
- autorequire: gmail-scraper
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-04 00:00:00 +01:00
12
+ date: 2010-02-26 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency