RubyGems - gmail-scraper - Versions diffs - 0.1 - Mend

gmail-scraper 0.1

Files changed (6) hide show

data/lib/conv_summary.rb ADDED Viewed

@@ -0,0 +1,50 @@
# Raised by ConvSummary.create_from_html when a conversation row links to a
# draft rather than to a thread.
#
# Derives from StandardError (not Exception) so that it is handled by a plain
# `rescue` / `rescue => e` like any other application error; code that
# rescues DraftException by name is unaffected.
class DraftException < StandardError
end
# Summary of one conversation (thread), built from one row of the
# conversation list.
class ConvSummary
  attr_accessor :uid, :subject, :nb_emails, :updated_at, :tags, :url

  # Builds a ConvSummary from the HTML node of one conversation row.
  #
  # html:: object responding to +content+ (the row text) and +search+ (XPath
  #        query). The first link found with ".//a" is used as the
  #        conversation link.
  #
  # The row text is split on blank lines and read by position: field 1 holds
  # the message count in parentheses (when present), field 3 the
  # comma-separated labels and field 4 the subject.
  #
  # Returns the new ConvSummary.
  # Raises DraftException when the link carries a "draft=" parameter.
  def self.create_from_html(html)
    fields = html.content.split("\n\n")
    link = html.search(".//a").first.attributes['href'].to_s
    # a row that only points to a draft is not a conversation
    raise DraftException if link =~ /draft=/

    conv = new
    conv.url = link
    # keep only the value of the "th" parameter, not the parameters after it
    conv.uid = link[/th=([^&#]*)/, 1]
    # the count is taken as displayed (drafts are not subtracted); a row
    # without a parenthesised count is a single message
    count = fields[1].to_s[/\((\d+)\)/, 1]
    conv.nb_emails = count.nil? ? 1 : count.to_i
    # labels are normalised to lower case; a row without a label field has none
    conv.tags = fields[3].to_s.split(",").collect { |label| label.strip.downcase }
    conv.subject = fields[4]
    conv
  end
end

data/lib/conversation.rb ADDED Viewed

@@ -0,0 +1,27 @@
+require 'conv_summary.rb'
# A conversation with its emails: the fields of a ConvSummary plus the list
# of emails added through add_email.
class Conversation<ConvSummary
  attr_accessor :created_at, :emails

  # Copies uid, url, subject, nb_emails and tags from conv_summary and starts
  # with an empty list of emails. created_at and updated_at stay nil until
  # emails are added.
  def initialize(conv_summary)
    @uid = conv_summary.uid
    @url = conv_summary.url
    @subject = conv_summary.subject
    @nb_emails = conv_summary.nb_emails
    @tags = conv_summary.tags
    @emails = []
  end

  # Appends email to the conversation.
  #
  # created_at / updated_at are widened so that they always hold the earliest
  # and latest created_at of the emails added so far, and the email receives
  # the conversation subject when the conversation has one.
  def add_email(email)
    @emails << email
    sent_at = email.created_at
    @created_at = sent_at if @created_at.nil? || @created_at > sent_at
    @updated_at = sent_at if @updated_at.nil? || @updated_at < sent_at
    email.subject = @subject unless @subject.nil?
  end

  # Returns a new array of the emails ordered by created_at (oldest first);
  # the stored list is left untouched.
  #
  # Sorting goes through created_at because the emails themselves are not
  # comparable with <=>.
  def sort_emails
    @emails.sort_by { |email| email.created_at }
  end
end

data/lib/email.rb ADDED Viewed

@@ -0,0 +1,56 @@
# DateTime is used below to parse the date of an email
require 'date'

# One email of a conversation.
class Email
  attr_accessor :uid, :sender, :receivers, :created_at, :text, :subject

  def initialize
    @receivers=[]
  end

  # Builds an Email from the HTML node of one message.
  #
  # html:: object responding to +search+ (XPath query). Its ".//tr/td" cells
  #        are read by position: 0 sender, 1 date, 2 "To" addresses,
  #        3 "Cc" addresses, 4 body (cell 5 when cell 4 starts with
  #        "Reply |").
  #
  # Result:
  # sender::     first address of cell 0, lower-cased (nil when none found)
  # uid::        fragment (text after '#') of the last link of cell 0 that has one
  # created_at:: DateTime parsed from cell 1
  # receivers::  two-element array: [addresses of cell 2, addresses of cell 3]
  # text::       body with surrounding whitespace removed ("" when absent)
  #
  # Raises the error of DateTime.strptime when cell 1 does not hold a date in
  # the "Mon, Jan 4, 2010 at 10:30 AM" format.
  def self.create_from_html(html)
    email = Email.new
    cells = html.search(".//tr/td")

    # sender
    sender = emails_to_array(cell_text(cells[0])).first
    email.sender = sender.downcase unless sender.nil?

    # uid: links without a fragment are skipped
    unless cells[0].nil?
      cells[0].search(".//a").each do |link|
        fragment = link.attributes['href'].to_s[/#.*/]
        email.uid = fragment.delete("#") unless fragment.nil?
      end
    end

    # date
    email.created_at = DateTime.strptime(cell_text(cells[1]), '%a, %b %d, %Y at %I:%M %p')

    # To then Cc, each kept as its own array
    email.receivers << emails_to_array(cell_text(cells[2]))
    email.receivers << emails_to_array(cell_text(cells[3]))

    # body: when cell 4 is the "Reply |" bar the body is in the next cell.
    # The text is only trimmed, never rewritten, so its content is preserved.
    body = cell_text(cells[4])
    body = cell_text(cells[5]) if body.start_with?("Reply |")
    email.text = body
    email
  end

  # Returns the distinct email addresses found in txt, in order of appearance.
  #
  # Internal helper of create_from_html. It stays callable from outside the
  # class, as before: a bare `private` does not apply to `def self.` methods.
  def self.emails_to_array(txt)
    reg = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
    txt.scan(reg).uniq
  end

  # Text of a cell without surrounding whitespace; "" for a missing cell.
  def self.cell_text(cell)
    cell.nil? ? "" : cell.content.strip
  end
  private_class_method :cell_text
end

data/lib/gmail.rb ADDED Viewed

@@ -0,0 +1,122 @@
+require 'rubygems'
+require 'mechanize'
+require 'conversation.rb'
+require 'email.rb'
# Raised when the page of a conversation contains no email.
#
# Derives from StandardError (not Exception) so that it is handled by a plain
# `rescue` / `rescue => e` like any other application error; code that
# rescues ThreadNotFoundException by name is unaffected.
class ThreadNotFoundException < StandardError
end
# Scrapes the conversations of a Gmail account through a Mechanize agent.
#
# Usage: create with the credentials, call connect, then list the
# conversation summaries and fetch_conversation for the ones of interest.
class Gmail
  # number of conversations on one page of the list
  CONV_PER_PAGE=50
  # rows of the conversation list
  XPATH_CONV_IN_LIST="//tr[@bgcolor='#E8EEF7'] | //tr[@bgcolor='#ffffff']"
  # tables holding one email each in the page of a conversation
  XPATH_EMAIL_IN_CONV=".//table[@bgcolor='#efefef' and @border='0']"

  # email:: login of the account
  # pwd::   password of the account
  def initialize(email, pwd)
    @email = email
    @pwd = pwd
  end

  # Logs in with the credentials given to the constructor and opens the
  # mailbox. Returns true when the mailbox page was reached, false otherwise.
  def connect
    @agent = mechanize_class.new
    # HTTPS, so that the credentials and the session are not sent in clear
    login_form = @agent.get("https://www.gmail.com").forms.first
    login_form.Email = @email
    login_form.Passwd = @pwd
    @agent.submit(login_form)
    page = @agent.get "https://mail.google.com/mail/?ui=html&zy=a&s=a"
    # bad (but working) method to detect the connection: this title is the
    # one compared against to recognise a failed login
    page.title.gsub("\n", "").strip != "Gmail: Email from Google"
  end

  # Scrapes the list of conversation summaries.
  #
  # conv_start:: index of the first conversation to yield (0-based)
  # conv_end::   index of the last conversation to yield (inclusive);
  #              nil to go on until the last page
  #
  # Yields one ConvSummary per conversation; rows that only hold a draft are
  # skipped. Can be called any number of times on the same instance.
  def list(conv_start=0, conv_end=nil)
    summary_as_html(conv_start, conv_end) do |html_conv|
      begin
        summary = ConvSummary.create_from_html(html_conv)
      rescue DraftException
        puts "Skiping a draft"
        next
      end
      # yielded outside of the rescue: a DraftException raised by the
      # caller's block is the caller's business and must reach it
      yield(summary)
    end
  end

  # Scrapes the emails found at the url of the given conversation summary.
  #
  # Returns a Conversation holding one Email per message.
  # Raises ThreadNotFoundException when the page contains no email.
  def fetch_conversation(conv_summary)
    conv = Conversation.new(conv_summary)
    conversation_as_html(conv.url) do |html|
      conv.add_email(Email.create_from_html(html))
    end
    conv
  end

  private

  # Mechanize class to instantiate: the top-level Mechanize when the loaded
  # library defines it, the older WWW::Mechanize otherwise.
  def mechanize_class
    defined?(::Mechanize) ? ::Mechanize : ::WWW::Mechanize
  end

  # Walks the pages of the conversation list, starting at the page that
  # contains conv_start, and yields the HTML row of every conversation whose
  # index lies between conv_start and conv_end (inclusive; no upper bound
  # when conv_end is nil). Stops after conv_end or when a page has no link
  # to a next page.
  def summary_as_html(conv_start, conv_end)
    page_index = conv_start / CONV_PER_PAGE
    # the stop condition is local to this call, so that a listing that ran
    # to the last page does not prevent the next one from running
    loop do
      offset = page_index * CONV_PER_PAGE
      # get the page of threads
      url = "?s=a&st=#{offset}"
      puts "fetching page: #{url}"
      page_list = @agent.get url
      # delegate each thread (html format)
      page_list.search(XPATH_CONV_IN_LIST).each_with_index do |conv, i|
        conv_index = offset + i
        puts "conversation_index: #{conv_index}"
        return if !conv_end.nil? && conv_index > conv_end
        yield(conv) if conv_index >= conv_start
      end
      # is there a next page?
      next_offset = offset + CONV_PER_PAGE
      break if page_list.search("//a[@href='?s=a&st=#{next_offset}']").empty?
      page_index += 1
    end
  end

  # Scrapes the emails from the HTML page of a thread/conversation.
  #
  # url:: url of the conversation (required; the nil default is only kept
  #       for compatibility of the signature)
  #
  # Yields the HTML table of each email.
  # Raises ArgumentError when url is nil and ThreadNotFoundException when
  # the page contains no email.
  def conversation_as_html(url=nil)
    raise ArgumentError, "the url of a conversation is required" if url.nil?
    # request the url of the thread
    list_emails = @agent.get("#{url}&d=e").search(XPATH_EMAIL_IN_CONV)
    raise ThreadNotFoundException if list_emails.empty?
    list_emails.each { |email_html| yield(email_html) }
  end
end

data/spec/basic_spec.rb ADDED Viewed

@@ -0,0 +1,58 @@
+require 'rubygems'
+require "spec"
+require '../lib/gmail.rb'
# Integration examples for Gmail: they log into a real account, so @login
# and @pass below have to be replaced by working credentials before running.
describe Gmail do
  before(:all) do
    @login="loging"
    @pass="password"
    @gmail=Gmail.new(@login, @pass)
    @gmail.connect
  end

  it "should not be connected with wrong login/password" do
    gmail=Gmail.new("bar", "foo")
    gmail.connect.should==false
  end

  it "should be connected with right login/password" do
    @gmail.connect.should==true
  end

  # the bounds are conversation indexes and both are inclusive
  it "should be able to extract a summary of the first conversations" do
    conv_start=0
    conv_end=1
    @gmail.connect
    count=0
    @gmail.list(conv_start, conv_end) { |thread_summary|
      thread_summary.subject.should_not==nil
      thread_summary.nb_emails.should>=1
      thread_summary.uid.should_not==nil
      thread_summary.tags.size.should>=0
      count+=1
    }
    count.should==(conv_end-conv_start+1)
  end

  it "should be able only to extract the correct interval of conversations" do
    count=0
    @gmail.list(1,2){ |conv_summary|
      # fetched to check that each listed conversation can be retrieved
      @gmail.fetch_conversation(conv_summary)
      count+=1
    }
    count.should==2
  end

  it "should be able to fetch the correct number of emails in a conversations" do
    @gmail.list(500,551){ |conv_summary|
      conversation=@gmail.fetch_conversation(conv_summary)
      conversation.emails.size.should==conversation.nb_emails
    }
  end
end

metadata ADDED Viewed

@@ -0,0 +1,68 @@
+--- !ruby/object:Gem::Specification
+name: gmail-scraper
+version: !ruby/object:Gem::Version
+  version: "0.1"
+platform: ruby
+authors:
+- Nicolas Maisonneuve
+autorequire: gmail-scraper
+bindir: bin
+cert_chain: []
+date: 2010-01-04 00:00:00 +01:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.9.3
+    version:
+description:
+email: n.maisonneuve@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/conversation.rb
+- lib/conv_summary.rb
+- lib/email.rb
+- lib/gmail.rb
+- spec/basic_spec.rb
+has_rdoc: true
+homepage: http://github.com/nmaisonneuve/gmail-scraper-gem
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Scrap Gmail's emails from its HTML Version.
+test_files: []