RubyGems - twittercrawler - Versions diffs - 0.0.1 - Mend

twittercrawler 0.0.1

Files changed (4) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 4f234ed55d4b0d82dce0b38b43d1a584596f2d45
+  data.tar.gz: 2493c492eb0e9761073ef546a4039b39a8b2f883
+SHA512:
+  metadata.gz: 023df126446294405e8ceafeaa9146d61561e05652adee497c7b78034e5146e5f0756f66c82d6273654e9bd7cf86e8ca8ee4dcda22076f8b682d3cd59fb73cdc
+  data.tar.gz: 3aea854fc09e6dfb5dcb27f449f98d1d9302eb1ca88cdbcf39d1ae30e71070b4773402d7c6f44e1167175d4bb9f879bb456d6e1da01eb2a54f085a963e9141ed

data/lib/twitter_crawler.rb ADDED Viewed

@@ -0,0 +1,65 @@
+require 'requestmanager'
+require 'selenium-webdriver'
+require 'pry'
+require 'nokogiri'
+load 'twitter_parser.rb'
+class TwitterCrawler
+  def initialize(search_term, operator, requests)
+    @search_term = search_term
+    @operator = operator
+    @requests = requests
+    @output = Array.new
+  end
+  # Generate advanced query
+  def gen_query
+    if @operator
+      return URI.encode(@search_term + " " + @operator)
+    else
+      return URI.encode(@search_term)
+    end
+  end
+  def crawl
+    @requests.get_page("https://twitter.com/search?f=tweets&q="+gen_query)
+    scroll_down(0)
+    get_tweets
+    @requests.close_all_browsers
+  end
+  # Get the tweets on the page
+  def get_tweets
+    browser = @requests.get_most_recent_browser[1].first
+    tweets = browser.find_elements(class: "tweet")
+    # Parse each tweet
+    tweets.each do |tweet|
+      tweet_html = tweet.attribute("innerHTML")
+      parser = TwitterParser.new(tweet_html)
+      @output.push(parser.parse_tweet)
+    end
+  end
+  # Scroll down to the bottom
+  def scroll_down(last_tweet_num)
+    # Scroll down to last tweet
+    browser = @requests.get_most_recent_browser[1].first
+    tweets = browser.find_elements(class: "tweet")
+    tweets[tweets.length-2].location_once_scrolled_into_view
+    # Check if it should be rerun
+    sleep(1)
+    tweet_count = browser.find_elements(class: "tweet").length
+    if tweet_count > last_tweet_num
+      scroll_down(tweet_count)
+    end
+  end
+  # Generate JSON for output
+  def gen_json
+    JSON.pretty_generate(@output)
+  end
+end

data/lib/twitter_parser.rb ADDED Viewed

@@ -0,0 +1,98 @@
+require 'nokogiri'
+require 'pry'
+class TwitterParser
+  def initialize(tweet)
+    @tweet = Nokogiri::HTML.parse(tweet)
+  end
+  # Parse the individual tweet
+  def parse_tweet
+    if !@tweet.text.empty?
+      return {
+        tweet_text: get_tweet_text,
+        username: get_username,
+        fullname: get_fullname,
+        user_id: get_user_id,
+        reply_to_user: get_reply_to_user[0],
+        reply_to_uid: get_reply_to_user[1],
+        tweet_time: get_tweet_time,
+        tweet_link: get_tweet_link,
+        retweet_count: get_retweet_count,
+        favorite_count: get_favorite_count,
+        reply_count: get_reply_count,
+        mention_names: get_mentions[0],
+        mention_uids: get_mentions[1]
+      }
+    end
+  end
+  # Get the username
+  def get_username
+    @tweet.css(".username").text
+  end
+  # Get the fullname
+  def get_fullname
+    @tweet.css(".fullname").text
+  end
+  # Get user ID number
+  def get_user_id
+    @tweet.css(".js-user-profile-link").css(".account-group")[0]["data-user-id"]
+  end
+  # Get the tweet text
+  def get_tweet_text
+    @tweet.css(".js-tweet-text-container").text.lstrip.strip
+  end
+  # Get the time for the tweet
+  def get_tweet_time
+    DateTime.parse(@tweet.css(".tweet-timestamp")[0]["title"]).strftime('%d %b %Y %H:%M:%S')
+  end
+  # Get the link to the tweet
+  def get_tweet_link
+    "https://twitter.com"+@tweet.css(".tweet-timestamp")[0]['href']
+  end
+  # Get the # of retweets
+  def get_retweet_count
+    @tweet.css(".ProfileTweet-action--retweet")[0].css("span")[0]['data-tweet-stat-count']
+  end
+  # Get the # of favorites
+  def get_favorite_count
+    @tweet.css(".ProfileTweet-action--favorite")[0].css("span")[0]['data-tweet-stat-count']
+  end
+  # Get the # of replies
+  def get_reply_count
+    @tweet.css(".ProfileTweet-action--reply")[0].css("span")[0]['data-tweet-stat-count']
+  end
+  # Get the user tweet is replying to (if any)
+  def get_reply_to_user
+    reply_to = @tweet.css("span").select{|s| s.text.include?("In reply")}[0]
+    if reply_to
+      reply_to_user = reply_to.css("a")[0]['href'].gsub("/", "@")
+      reply_to_uid = reply_to.css("a")[0]['data-user-id']
+      return reply_to_user, reply_to_uid
+    else
+      return nil, nil
+    end
+  end
+  # Get the mentioned accounts (if any)
+  def get_mentions
+    mentions = @tweet.css(".twitter-atreply")
+    if !mentions.empty?
+      mention_names = mentions.map{|t| t.text}
+      mention_uids = mentions.map{|t| t['data-mentioned-user-id']}
+      return mention_names, mention_uids
+    else
+      return nil, nil
+    end
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,46 @@
+--- !ruby/object:Gem::Specification
+name: twittercrawler
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- M. C. McGrath
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2017-02-03 00:00:00.000000000 Z
+dependencies: []
+description: Crawls Twitter
+email: shidash@shidash.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/twitter_crawler.rb
+- lib/twitter_parser.rb
+homepage: https://github.com/TransparencyToolkit/TwitterCrawler
+licenses:
+- GPL
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.8
+signing_key:
+specification_version: 4
+summary: Crawls Twitter
+test_files: []
+has_rdoc: