RubyGems - whos_using_what - Versions diffs - 0.3.1 → 0.3.3 - Mend

whos_using_what 0.3.1 → 0.3.3

Files changed (7) hide show

data/lib/whos_using_what/api_clients/base_api_client.rb +54 -0
data/lib/whos_using_what/api_clients/google_client.rb +177 -0
data/lib/whos_using_what/base.rb +5 -1
data/lib/whos_using_what/data_gatherers/tech_ad_tagger.rb +15 -6
data/lib/whos_using_what/scripts/data_populators.rb +28 -13
metadata +50 -2
data/lib/whos_using_what/api_clients/whos_using_what_search_client.rb +0 -137

data/lib/whos_using_what/api_clients/base_api_client.rb CHANGED Viewed

@@ -2,6 +2,60 @@ require_relative "../base"
 class BaseApiClient < Base
+  require "uri"
+  require "rest-client"
+  def arraySearch(array, rawHtml)
+    rawHtml = rawHtml.downcase
+    array.each do |token|
+      if (rawHtml.index(token) != nil)
+        return true
+      end
+    end
+    return false
+  end
+  def arry_to_str_delim array, delim
+    str = ""
+    i = 0
+    array.each do |entry|
+      if i < 1
+        str = entry.strip
+      else
+        str = str << delim << entry.strip
+      end
+      i += 1
+    end
+    str.strip
+  end
+  def cleanup_url url
+    #clean up url
+    url = url.strip
+    if url["www."] != nil
+      url["www."] = ""
+    end
+    if url["site:"] != nil
+      url["site:"] = ""
+    end
+    url
+  end
+  def determineIfUsesTechnology(technology, rawHtml)
+    isJobPage = arraySearch(@jobPageTokens, rawHtml)
+    return isJobPage
+  end
   def starts_with?(string, prefix)
     prefix = prefix.to_s
     string[0, prefix.length] == prefix

data/lib/whos_using_what/api_clients/google_client.rb ADDED Viewed

@@ -0,0 +1,177 @@
+require_relative "base_api_client"
+require 'mechanize'
+require 'watir-webdriver'
+require 'headless'
+class GoogleClient < BaseApiClient
+  attr :results
+  def initialize
+    @negativeMatchUrlPatterns = ['google', 'youtube', 'duckduckgo', 'bing', 'yahoo']
+    @positiveMatchUrlPatterns = ['http', 'www']
+    @technologiesToSearchFor = ['ruby', 'java', 'javascript', 'python']
+    @jobPageTokens = ['job', 'hiring', 'career']
+    @results = Hash.new
+    @mechanize = Mechanize.new
+    headless = Headless.new
+    headless.start
+    @browser = Watir::Browser.new :firefox
+  end
+  def extractUrls (rawInput, mustContainUrl)
+    acceptedUrls = Array.new
+    if (rawInput == nil)
+      return acceptedUrls
+    end
+    urls = []
+    begin
+      urls = URI.extract(rawInput)
+    end
+    if urls.size < 1
+      return acceptedUrls
+    end
+    mustContainUrl = cleanup_url mustContainUrl
+    urls.each do |url|
+      url = cleanup_url url
+      accept_url_bool = false
+      @positiveMatchUrlPatterns.each do |token|
+        if (starts_with? url, token) ||
+            (starts_with? url, mustContainUrl)
+          accept_url_bool = true
+          break
+        end
+      end
+      if !accept_url_bool
+        next
+      end
+      if !(url.include? mustContainUrl)
+        accept_url_bool = false
+      end
+      @negativeMatchUrlPatterns.each do |token|
+        if url.include? token
+          accept_url_bool = false
+          break
+        end
+      end
+      url = cleanup_url url
+      if accept_url_bool &&
+          url != nil && !(acceptedUrls.include? url)
+        acceptedUrls.push url
+      end
+    end
+    acceptedUrls
+  end
+  def generate_google_url site_url, search_keyword
+    query_url = [
+        "http://www.google.com/search?",
+        "hl=en",
+        "&as_q=" << "hiring+" << search_keyword,
+        "&as_sitesearch=" << (cleanup_url (site_url))
+    ]
+    url = arry_to_str_delim query_url, ""
+  end
+  def generate_duckduckgo_url site_url, search_keyword
+    query_url = [
+        "http://duckduckgo.com/?",
+        "q=" <<
+            "site:" << (cleanup_url (site_url)) <<
+            "+hiring+" << search_keyword,
+    ]
+    url = arry_to_str_delim query_url, ""
+  end
+  #performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
+  #If an ad exists it is returned as part of map
+  def google_search queries, site_url
+    ret_map = Hash.new
+    @technologiesToSearchFor.each do |search_keyword|
+      url = ""
+      raw_html = ""
+      begin
+        url = generate_duckduckgo_url site_url, search_keyword
+        #perform initial search engine search
+        @browser.goto url
+        raw_html = @browser.html
+        puts "successfully queried url:" << url
+      rescue Exception => e
+        puts "exception:" << e.message << " when querying url: " << url
+      end
+      urls = extractUrls(raw_html, site_url)
+      urls.each do |cur_url|
+        begin
+          @browser.goto cur_url
+          html = @browser.html
+          #strip all html tags, for human readability and to cut down on some errors that could arise
+          # TODO this was causing an exception
+          #  html = html.gsub!(/(<[^>]*>)|\n|\t/s) { " " }
+          uses_technology = determineIfUsesTechnology(search_keyword, html)
+          if (uses_technology)
+            ret_map[search_keyword] = cur_url
+          end
+        rescue Exception => e
+          puts e.message
+        end
+      end
+    end
+    #throttle queries to avoid being black-listed by search engine
+    sleep_seconds = rand(1-5)
+    sleep sleep_seconds
+    ret_map
+  end
+end

data/lib/whos_using_what/base.rb CHANGED Viewed

@@ -1,5 +1,9 @@
 class Base
+  attr :set_paths
+  @@paths_set = false
   def self.set_paths
     $:.unshift(File.expand_path('../data_gatherers', __FILE__))
     $:.unshift(File.expand_path('../data_searchers', __FILE__))
@@ -8,9 +12,9 @@ class Base
     $:.unshift(File.expand_path('../util', __FILE__))
     $:.unshift(File.expand_path('../logging', __FILE__))
+    @@paths_set = true
   end
   set_paths
 end

data/lib/whos_using_what/data_gatherers/tech_ad_tagger.rb CHANGED Viewed

@@ -2,11 +2,11 @@ require_relative "../base"
 class TechAdTagger < Base
-  require 'whos_using_what_search_client'
+  def initialize
+    require_relative '../api_clients/google_client'
-  def initialize
-    @search_client = WhosUsingWhatSearchClient.new
+    @search_client = GoogleClient.new
     @mongo_client = MongoHelper.get_mongo_connection
     @companies_coll = @mongo_client['companies']
@@ -18,9 +18,18 @@ class TechAdTagger < Base
   #iterates through array and updates company db record with technologies found from ads from their website
   def tag_company_with_technologies tech_keywords
+    # uncomment if need to clear out all existing technologies
+=begin
+    @companies_coll.find().each do |company|
+      company['languages'] = {}
+      @companies_coll.update({"_id" => company["_id"]}, company)
+    end
+=end
     companies = @companies_coll.find(
-        "languages" => {"$exists" => false}
-    )
+        # "languages" => {"$exists" => false}
+        # "languages" => {}
+    ).to_a
     languages = Hash.new
@@ -28,7 +37,7 @@ class TechAdTagger < Base
       languages = Hash.new
-      company_languages_map = @search_client.search tech_keywords, company["websiteUrl"]
+      company_languages_map = @search_client.google_search tech_keywords, company["websiteUrl"]
       company_languages_map.each do |key, value|

data/lib/whos_using_what/scripts/data_populators.rb CHANGED Viewed

@@ -34,10 +34,10 @@ class DataPopulators
     @@geo_tagger = GeoTagger.new @@log
     @@gather_companies = GatherCompanies.new
     @@companies_searcher = CompaniesSearcher.new @@geo_tagger
-    @@ech_ad_tagger = TechAdTagger.new
+    @@tech_ad_tagger = TechAdTagger.new
     #data holders
-    @@facet_location = "us:82" #Sacramento
+    @@facet_location = "us:84"
     @@programming_languages = ["java", "ruby", "c#", "php", "python", "javascript"]
@@ -48,19 +48,34 @@ class DataPopulators
   if __FILE__ == $PROGRAM_NAME
-    begin
+    t1 = Thread.new do
-      t1 = Thread.new do
+      begin
-        @@geo_tagger.load_geolocations_into_db
+       # @@gather_companies.load_companies_to_db 700, 0, @@facet_location
+      rescue Exception => e
+        puts e.message
+        puts e.backtrace
       end
-    rescue Exception => e
-      puts e.message
-      puts e.backtrace
     end
+    t2 = Thread.new do
+      begin
+        @@tech_ad_tagger.tag_company_with_technologies @@programming_languages
+      rescue Exception => e
+        puts e.message
+        puts e.backtrace
+      end
+    end
     #this is necessary, for some reason, otherwise the process just gets killed, as the sub-threads are dependent on the main thread remaining alive
     while true
       sleep(5)
@@ -71,15 +86,15 @@ class DataPopulators
   #examples:
-  # self_instance.tech_ad_tagger.tag_company_with_technologies self_instance.programming_languages
+  # @@tech_ad_tagger.tag_company_with_technologies @@programming_languages
-  # self_instance.gather_companies.load_companies_to_db 700, 0, facet_location
+  # @@gather_companies.load_companies_to_db 700, 0, @@facet_location
-  # self_instance.geo_tagger.load_geolocations_into_db
+  # @@geo_tagger.load_geolocations_into_db
-  # self_instance.geo_tagger.update_companies_with_latitude_longitude
+  # @@geo_tagger.update_companies_with_latitude_longitude
-  # near = self_instance.companies_searcher.zip_code_search "95688"
+  # near = @@companies_searcher.zip_code_search "95688"
   # near = self_instance.companies_searcher.geospatial_search -122.4099154, 37.8059887

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: whos_using_what
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.3.3
   prerelease:
 platform: ruby
 authors:
@@ -91,6 +91,22 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: mongo
   requirement: !ruby/object:Gem::Requirement
@@ -107,6 +123,38 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: watir-webdriver
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: headless
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 description: What companies are using what technologies
 email: r.dane1010@gmail.com
 executables: []
@@ -117,9 +165,9 @@ files:
 - lib/whos_using_what/util/map_data_extraction_util.rb
 - lib/whos_using_what/logging/logger_factory.rb
 - lib/whos_using_what/scripts/data_populators.rb
+- lib/whos_using_what/api_clients/google_client.rb
 - lib/whos_using_what/api_clients/base_api_client.rb
 - lib/whos_using_what/api_clients/linkedin_client.rb
-- lib/whos_using_what/api_clients/whos_using_what_search_client.rb
 - lib/whos_using_what/api_clients/google_locations_client.rb
 - lib/whos_using_what/data_gatherers/gather_companies.rb
 - lib/whos_using_what/data_gatherers/geo_tagger.rb

data/lib/whos_using_what/api_clients/whos_using_what_search_client.rb DELETED Viewed

@@ -1,137 +0,0 @@
-require_relative "../base"
-class WhosUsingWhatSearchClient < Base
-  require "uri"
-  require "rest-client"
-  attr :results
-  def initialize()
-    @negativeMatchUrlPatterns = Array.new.push("google.com").push("youtube.com")
-    @positiveMatchUrlPatterns = Array.new.push("http")
-    @technologiesToSearchFor = Array.new.push("ruby").push("java").push("javascript").push("python").push("c++").push("c#")
-    @jobPageTokens = Array.new.push("job", "hiring", "career")
-    @results = Hash.new
-  end
-  private
-  def extractUrls (rawInput, mustContainUrl)
-    acceptedUrls = Array.new
-    if (rawInput == nil)
-      return acceptedUrls
-    end
-    urls = []
-    begin
-      urls = URI.extract(rawInput)
-    end
-    if urls.size < 1
-      return urls
-    end
-    urls.each do |url|
-      add = true
-      @negativeMatchUrlPatterns.each do |token|
-        if (nil != url.index(token))
-          add = false
-        end
-      end
-      @positiveMatchUrlPatterns.each do |token|
-        if (nil == url.index(token) || url.index(token) > 0)
-          add = false
-        end
-      end
-      if (mustContainUrl != nil && url.index(mustContainUrl) == nil)
-        add = false
-      end
-      if (add)
-        acceptedUrls.push(url)
-      end
-    end
-    acceptedUrls
-  end
-  def arraySearch(array, rawHtml)
-    rawHtml = rawHtml.downcase
-    array.each do |token|
-      if (rawHtml.index(token) != nil)
-        return true
-      end
-    end
-    return false
-  end
-  def determineIfUsesTechnology(technology, rawHtml)
-    isJobPage = arraySearch(@jobPageTokens, rawHtml)
-    return isJobPage
-  end
-  public
-  #performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
-  #If an ad exists it is returned as part of map
-  def search queries, url
-    begin
-      rawHtml = RestClient.get(url)
-    rescue
-    end
-    urls = extractUrls(rawHtml, url)
-    matching_url = nil
-    ret_map = Hash.new
-    urls.each do |cur_url|
-      begin
-        html = RestClient.get(cur_url)
-        queries.each do |query|
-          url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << cur_url << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
-          uses_technology = determineIfUsesTechnology(query, html)
-          if (uses_technology)
-            ret_map[query] = cur_url
-          end
-        end
-      rescue Exception => exception
-        #don't really care at this point, probably not worth logging as some sites just don't end up loading properly
-      end
-    end
-    ret_map
-  end
-end