RubyGems - whos_using_what - Versions diffs - 0.3.1 → 0.3.3 - Mend

whos_using_what 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/lib/whos_using_what/api_clients/base_api_client.rb +54 -0
data/lib/whos_using_what/api_clients/google_client.rb +177 -0
data/lib/whos_using_what/base.rb +5 -1
data/lib/whos_using_what/data_gatherers/tech_ad_tagger.rb +15 -6
data/lib/whos_using_what/scripts/data_populators.rb +28 -13
metadata +50 -2
data/lib/whos_using_what/api_clients/whos_using_what_search_client.rb +0 -137

data/lib/whos_using_what/api_clients/base_api_client.rb CHANGED Viewed

@@ -2,6 +2,60 @@ require_relative "../base"
 class BaseApiClient < Base
+  require "uri"
+  require "rest-client"
+  # Reports whether at least one entry of array can be located (String#index)
+  # inside a lower-cased copy of rawHtml.
+  # array   - collection of tokens to look for
+  # rawHtml - text to scan; it is down-cased before the lookup
+  # Returns true on a hit, false otherwise (including for an empty array).
+  def arraySearch(array, rawHtml)
+    haystack = rawHtml.downcase
+    array.any? { |needle| !haystack.index(needle).nil? }
+  end
+  # Builds one string out of the entries of array, separated by delim.
+  # Every entry is stripped of surrounding whitespace before joining, and the
+  # joined result is stripped once more (this matters when delim itself is
+  # whitespace and the outer entries are blank).
+  # array - collection of strings
+  # delim - separator string placed between consecutive entries
+  # Returns the joined string; "" for an empty array. The entries themselves
+  # are left unmodified.
+  def arry_to_str_delim array, delim
+    array.map { |entry| entry.strip }.join(delim).strip
+  end
+  # Normalises a url for comparison: returns a trimmed copy in which the first
+  # occurrence of "www." and then the first occurrence of "site:" have been
+  # removed. The string passed in by the caller is not modified.
+  def cleanup_url url
+    url.strip.sub("www.", "").sub("site:", "")
+  end
+  # Decides whether rawHtml looks like a job page that mentions technology.
+  # Previously the technology argument was ignored, so any page containing a
+  # job token was reported as a match for every technology searched.
+  # technology - keyword to look for (compared lower-cased)
+  # rawHtml    - page source to inspect
+  # Returns true only when one of @jobPageTokens occurs in the page AND the
+  # keyword occurs as a standalone term; false otherwise.
+  def determineIfUsesTechnology(technology, rawHtml)
+    return false unless arraySearch(@jobPageTokens, rawHtml)
+    keyword = technology.to_s.strip.downcase
+    return false if keyword.empty?
+    # Reject hits embedded in a longer word, e.g. "java" inside "javascript".
+    standalone = /(?<![a-z0-9])#{Regexp.escape(keyword)}(?![a-z0-9])/
+    arraySearch([standalone], rawHtml)
+  end
   def starts_with?(string, prefix)
     prefix = prefix.to_s
     string[0, prefix.length] == prefix

data/lib/whos_using_what/api_clients/google_client.rb ADDED Viewed

@@ -0,0 +1,177 @@
+require_relative "base_api_client"
+require 'mechanize'
+require 'watir-webdriver'
+require 'headless'
+# Drives a headless Firefox session to run site-restricted search-engine
+# queries for a company's website and to inspect the result pages for
+# job-listing tokens.
+class GoogleClient < BaseApiClient
+  attr_reader :results
+  # Sets up the URL filters, keyword lists and the browser session.
+  # Starts a Headless display and opens a Firefox-backed Watir browser.
+  def initialize
+    @negativeMatchUrlPatterns = ['google', 'youtube', 'duckduckgo', 'bing', 'yahoo']
+    @positiveMatchUrlPatterns = ['http', 'www']
+    @technologiesToSearchFor = ['ruby', 'java', 'javascript', 'python']
+    @jobPageTokens = ['job', 'hiring', 'career']
+    @results = Hash.new
+    @mechanize = Mechanize.new
+    headless = Headless.new
+    headless.start
+    @browser = Watir::Browser.new :firefox
+  end
+  # Pulls candidate URLs out of rawInput and filters them.
+  # rawInput       - text (e.g. a result page's html) to scan; nil yields []
+  # mustContainUrl - site url every accepted URL has to contain (after cleanup_url)
+  # A URL is accepted when it starts with a positive pattern or with the site
+  # url, contains the site url, and contains none of the negative patterns.
+  # Returns an array of unique, cleaned-up URLs in order of first appearance.
+  def extractUrls(rawInput, mustContainUrl)
+    acceptedUrls = []
+    return acceptedUrls if rawInput.nil?
+    urls = URI.extract(rawInput)
+    return acceptedUrls if urls.empty?
+    mustContainUrl = cleanup_url mustContainUrl
+    urls.each do |url|
+      url = cleanup_url url
+      has_known_prefix = @positiveMatchUrlPatterns.any? do |token|
+        starts_with?(url, token) || starts_with?(url, mustContainUrl)
+      end
+      next unless has_known_prefix
+      next unless url.include? mustContainUrl
+      next if @negativeMatchUrlPatterns.any? { |token| url.include? token }
+      # The url is stored exactly as it was checked; it is not cleaned a second
+      # time, which could strip a further "www." out of its path.
+      acceptedUrls.push url unless acceptedUrls.include? url
+    end
+    acceptedUrls
+  end
+  # Builds a Google advanced-search URL for "hiring <search_keyword>"
+  # restricted to site_url. The keyword is form-encoded.
+  def generate_google_url site_url, search_keyword
+    query_url = [
+        "http://www.google.com/search?",
+        "hl=en",
+        "&as_q=hiring+#{escape_query_term(search_keyword)}",
+        "&as_sitesearch=#{cleanup_url(site_url)}"
+    ]
+    arry_to_str_delim(query_url, "")
+  end
+  # Builds a DuckDuckGo URL for "site:<site_url> hiring <search_keyword>".
+  # The keyword is form-encoded.
+  def generate_duckduckgo_url site_url, search_keyword
+    query_url = [
+        "http://duckduckgo.com/?",
+        "q=site:#{cleanup_url(site_url)}+hiring+#{escape_query_term(search_keyword)}"
+    ]
+    arry_to_str_delim(query_url, "")
+  end
+  #performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
+  #If an ad exists it is returned as part of map
+  # queries  - NOTE(review): not consulted; the keywords searched come from
+  #            @technologiesToSearchFor - confirm whether that is intended
+  # site_url - company website the search is restricted to
+  # Returns a Hash of keyword => url of a matching page (the last match wins).
+  def google_search queries, site_url
+    ret_map = Hash.new
+    @technologiesToSearchFor.each do |search_keyword|
+      url = ""
+      raw_html = ""
+      begin
+        url = generate_duckduckgo_url site_url, search_keyword
+        #perform initial search engine search
+        @browser.goto url
+        raw_html = @browser.html
+        puts "successfully queried url:#{url}"
+      rescue StandardError => e
+        # StandardError rather than Exception, so Interrupt and SystemExit still propagate
+        puts "exception:#{e.message} when querying url: #{url}"
+      end
+      extractUrls(raw_html, site_url).each do |cur_url|
+        begin
+          @browser.goto cur_url
+          html = @browser.html
+          #strip all html tags, for human readability and to cut down on some errors that could arise
+          # TODO this was causing an exception
+          #  html = html.gsub!(/(<[^>]*>)|\n|\t/s) { " " }
+          ret_map[search_keyword] = cur_url if determineIfUsesTechnology(search_keyword, html)
+        rescue StandardError => e
+          puts e.message
+        end
+      end
+    end
+    #throttle queries to avoid being black-listed by search engine
+    # rand(1..5) gives 1 to 5 seconds; the former rand(1-5) was rand(-4)
+    sleep rand(1..5)
+    ret_map
+  end
+  private
+  # Form-encodes a search term so characters such as "#" or "+" (as in "c#",
+  # "c++") do not cut the query string short.
+  def escape_query_term(term)
+    URI.encode_www_form_component(term.to_s.strip)
+  end
+end

data/lib/whos_using_what/base.rb CHANGED Viewed

@@ -1,5 +1,9 @@
 class Base
+  attr :set_paths
+  @@paths_set = false
   def self.set_paths
     $:.unshift(File.expand_path('../data_gatherers', __FILE__))
     $:.unshift(File.expand_path('../data_searchers', __FILE__))
@@ -8,9 +12,9 @@ class Base
     $:.unshift(File.expand_path('../util', __FILE__))
     $:.unshift(File.expand_path('../logging', __FILE__))
+    @@paths_set = true
   end
   set_paths
 end

data/lib/whos_using_what/data_gatherers/tech_ad_tagger.rb CHANGED Viewed

@@ -2,11 +2,11 @@ require_relative "../base"
 class TechAdTagger < Base
-  require 'whos_using_what_search_client'
+  def initialize
+    require_relative '../api_clients/google_client'
-  def initialize
-    @search_client = WhosUsingWhatSearchClient.new
+    @search_client = GoogleClient.new
     @mongo_client = MongoHelper.get_mongo_connection
     @companies_coll = @mongo_client['companies']
@@ -18,9 +18,18 @@ class TechAdTagger < Base
   #iterates through array and updates company db record with technologies found from ads from their website
   def tag_company_with_technologies tech_keywords
+    # uncomment if need to clear out all existing technologies
+=begin
+    @companies_coll.find().each do |company|
+      company['languages'] = {}
+      @companies_coll.update({"_id" => company["_id"]}, company)
+    end
+=end
     companies = @companies_coll.find(
-        "languages" => {"$exists" => false}
-    )
+        # "languages" => {"$exists" => false}
+        # "languages" => {}
+    ).to_a
     languages = Hash.new
@@ -28,7 +37,7 @@ class TechAdTagger < Base
       languages = Hash.new
-      company_languages_map = @search_client.search tech_keywords, company["websiteUrl"]
+      company_languages_map = @search_client.google_search tech_keywords, company["websiteUrl"]
       company_languages_map.each do |key, value|

data/lib/whos_using_what/scripts/data_populators.rb CHANGED Viewed

@@ -34,10 +34,10 @@ class DataPopulators
     @@geo_tagger = GeoTagger.new @@log
     @@gather_companies = GatherCompanies.new
     @@companies_searcher = CompaniesSearcher.new @@geo_tagger
-    @@ech_ad_tagger = TechAdTagger.new
+    @@tech_ad_tagger = TechAdTagger.new
     #data holders
-    @@facet_location = "us:82" #Sacramento
+    @@facet_location = "us:84"
     @@programming_languages = ["java", "ruby", "c#", "php", "python", "javascript"]
@@ -48,19 +48,34 @@ class DataPopulators
   if __FILE__ == $PROGRAM_NAME
-    begin
+    t1 = Thread.new do
-      t1 = Thread.new do
+      begin
-        @@geo_tagger.load_geolocations_into_db
+       # @@gather_companies.load_companies_to_db 700, 0, @@facet_location
+      rescue Exception => e
+        puts e.message
+        puts e.backtrace
       end
-    rescue Exception => e
-      puts e.message
-      puts e.backtrace
     end
+    t2 = Thread.new do
+      begin
+        @@tech_ad_tagger.tag_company_with_technologies @@programming_languages
+      rescue Exception => e
+        puts e.message
+        puts e.backtrace
+      end
+    end
     #this is necessary, for some reason, otherwise the process just gets killed, as the sub-threads are dependent on the main thread remaining alive
     while true
       sleep(5)
@@ -71,15 +86,15 @@ class DataPopulators
   #examples:
-  # self_instance.tech_ad_tagger.tag_company_with_technologies self_instance.programming_languages
+  # @@tech_ad_tagger.tag_company_with_technologies @@programming_languages
-  # self_instance.gather_companies.load_companies_to_db 700, 0, facet_location
+  # @@gather_companies.load_companies_to_db 700, 0, @@facet_location
-  # self_instance.geo_tagger.load_geolocations_into_db
+  # @@geo_tagger.load_geolocations_into_db
-  # self_instance.geo_tagger.update_companies_with_latitude_longitude
+  # @@geo_tagger.update_companies_with_latitude_longitude
-  # near = self_instance.companies_searcher.zip_code_search "95688"
+  # near = @@companies_searcher.zip_code_search "95688"
   # near = self_instance.companies_searcher.geospatial_search -122.4099154, 37.8059887

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: whos_using_what
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.3.3
   prerelease:
 platform: ruby
 authors:
@@ -91,6 +91,22 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: mongo
   requirement: !ruby/object:Gem::Requirement
@@ -107,6 +123,38 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: watir-webdriver
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: headless
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 description: What companies are using what technologies
 email: r.dane1010@gmail.com
 executables: []
@@ -117,9 +165,9 @@ files:
 - lib/whos_using_what/util/map_data_extraction_util.rb
 - lib/whos_using_what/logging/logger_factory.rb
 - lib/whos_using_what/scripts/data_populators.rb
+- lib/whos_using_what/api_clients/google_client.rb
 - lib/whos_using_what/api_clients/base_api_client.rb
 - lib/whos_using_what/api_clients/linkedin_client.rb
-- lib/whos_using_what/api_clients/whos_using_what_search_client.rb
 - lib/whos_using_what/api_clients/google_locations_client.rb
 - lib/whos_using_what/data_gatherers/gather_companies.rb
 - lib/whos_using_what/data_gatherers/geo_tagger.rb

data/lib/whos_using_what/api_clients/whos_using_what_search_client.rb DELETED Viewed

@@ -1,137 +0,0 @@
-require_relative "../base"
-class WhosUsingWhatSearchClient < Base
-  require "uri"
-  require "rest-client"
-  attr :results
-  def initialize()
-    @negativeMatchUrlPatterns = Array.new.push("google.com").push("youtube.com")
-    @positiveMatchUrlPatterns = Array.new.push("http")
-    @technologiesToSearchFor = Array.new.push("ruby").push("java").push("javascript").push("python").push("c++").push("c#")
-    @jobPageTokens = Array.new.push("job", "hiring", "career")
-    @results = Hash.new
-  end
-  private
-  def extractUrls (rawInput, mustContainUrl)
-    acceptedUrls = Array.new
-    if (rawInput == nil)
-      return acceptedUrls
-    end
-    urls = []
-    begin
-      urls = URI.extract(rawInput)
-    end
-    if urls.size < 1
-      return urls
-    end
-    urls.each do |url|
-      add = true
-      @negativeMatchUrlPatterns.each do |token|
-        if (nil != url.index(token))
-          add = false
-        end
-      end
-      @positiveMatchUrlPatterns.each do |token|
-        if (nil == url.index(token) || url.index(token) > 0)
-          add = false
-        end
-      end
-      if (mustContainUrl != nil && url.index(mustContainUrl) == nil)
-        add = false
-      end
-      if (add)
-        acceptedUrls.push(url)
-      end
-    end
-    acceptedUrls
-  end
-  def arraySearch(array, rawHtml)
-    rawHtml = rawHtml.downcase
-    array.each do |token|
-      if (rawHtml.index(token) != nil)
-        return true
-      end
-    end
-    return false
-  end
-  def determineIfUsesTechnology(technology, rawHtml)
-    isJobPage = arraySearch(@jobPageTokens, rawHtml)
-    return isJobPage
-  end
-  public
-  #performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
-  #If an ad exists it is returned as part of map
-  def search queries, url
-    begin
-      rawHtml = RestClient.get(url)
-    rescue
-    end
-    urls = extractUrls(rawHtml, url)
-    matching_url = nil
-    ret_map = Hash.new
-    urls.each do |cur_url|
-      begin
-        html = RestClient.get(cur_url)
-        queries.each do |query|
-          url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << cur_url << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
-          uses_technology = determineIfUsesTechnology(query, html)
-          if (uses_technology)
-            ret_map[query] = cur_url
-          end
-        end
-      rescue Exception => exception
-        #don't really care at this point, probably not worth logging as some sites just don't end up loading properly
-      end
-    end
-    ret_map
-  end
-end