whos_using_what 0.2.12 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/whos_using_what/api_clients/{search_client.rb → whos_using_what_search_client.rb} +38 -18
- data/lib/whos_using_what/data_gatherers/gather_companies.rb +2 -2
- data/lib/whos_using_what/data_gatherers/geo_tagger.rb +4 -2
- data/lib/whos_using_what/data_gatherers/tech_ad_tagger.rb +48 -0
- data/lib/whos_using_what/no_sql/mongo_helper.rb +2 -2
- data/lib/whos_using_what/scripts/data_populators.rb +76 -0
- metadata +4 -2
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            require_relative "../base"
         | 
| 2 2 |  | 
| 3 | 
            -
            class  | 
| 3 | 
            +
            class WhosUsingWhatSearchClient < Base
         | 
| 4 4 |  | 
| 5 5 | 
             
              require "uri"
         | 
| 6 6 | 
             
              require "rest-client"
         | 
| @@ -22,11 +22,26 @@ class SearchClient   < Base | |
| 22 22 |  | 
| 23 23 | 
             
              end
         | 
| 24 24 |  | 
| 25 | 
            +
              private
         | 
| 26 | 
            +
             | 
| 25 27 | 
             
              def extractUrls (rawInput, mustContainUrl)
         | 
| 26 28 |  | 
| 27 29 | 
             
                acceptedUrls = Array.new
         | 
| 28 30 |  | 
| 29 | 
            -
                 | 
| 31 | 
            +
                if (rawInput == nil)
         | 
| 32 | 
            +
                  return acceptedUrls
         | 
| 33 | 
            +
                end
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                urls = []
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                begin
         | 
| 38 | 
            +
                  urls = URI.extract(rawInput)
         | 
| 39 | 
            +
                end
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                if urls.size < 1
         | 
| 42 | 
            +
                  return urls
         | 
| 43 | 
            +
                end
         | 
| 44 | 
            +
             | 
| 30 45 | 
             
                urls.each do |url|
         | 
| 31 46 | 
             
                  add = true
         | 
| 32 47 | 
             
                  @negativeMatchUrlPatterns.each do |token|
         | 
| @@ -75,10 +90,12 @@ class SearchClient   < Base | |
| 75 90 |  | 
| 76 91 | 
             
              end
         | 
| 77 92 |  | 
| 93 | 
            +
              public
         | 
| 78 94 |  | 
| 79 | 
            -
              def search(query, site)
         | 
| 80 95 |  | 
| 81 | 
            -
             | 
| 96 | 
            +
              #performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
         | 
| 97 | 
            +
              #If an ad exists it is returned as part of map
         | 
| 98 | 
            +
              def search queries, url
         | 
| 82 99 |  | 
| 83 100 | 
             
                begin
         | 
| 84 101 | 
             
                  rawHtml = RestClient.get(url)
         | 
| @@ -86,31 +103,34 @@ class SearchClient   < Base | |
| 86 103 |  | 
| 87 104 | 
             
                end
         | 
| 88 105 |  | 
| 89 | 
            -
                urls = extractUrls(rawHtml,  | 
| 106 | 
            +
                urls = extractUrls(rawHtml, url)
         | 
| 90 107 |  | 
| 91 | 
            -
                 | 
| 108 | 
            +
                matching_url = nil
         | 
| 92 109 |  | 
| 93 | 
            -
                 | 
| 110 | 
            +
                ret_map = Hash.new
         | 
| 94 111 |  | 
| 95 112 | 
             
                urls.each do |cur_url|
         | 
| 96 113 | 
             
                  begin
         | 
| 97 114 | 
             
                    html = RestClient.get(cur_url)
         | 
| 98 | 
            -
             | 
| 99 | 
            -
                     | 
| 100 | 
            -
             | 
| 101 | 
            -
                       | 
| 102 | 
            -
             | 
| 115 | 
            +
             | 
| 116 | 
            +
                    queries.each do |query|
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                      url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << cur_url << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
         | 
| 119 | 
            +
             | 
| 120 | 
            +
                      uses_technology = determineIfUsesTechnology(query, html)
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                      if (uses_technology)
         | 
| 123 | 
            +
                        ret_map[query] = cur_url
         | 
| 124 | 
            +
                      end
         | 
| 125 | 
            +
             | 
| 103 126 | 
             
                    end
         | 
| 127 | 
            +
             | 
| 104 128 | 
             
                  rescue Exception => exception
         | 
| 105 | 
            -
                    # | 
| 129 | 
            +
                    #don't really care at this point, probably not worth logging as some sites just don't end up loading properly
         | 
| 106 130 | 
             
                  end
         | 
| 107 131 | 
             
                end
         | 
| 108 132 |  | 
| 109 | 
            -
                 | 
| 110 | 
            -
                  return false
         | 
| 111 | 
            -
                end
         | 
| 112 | 
            -
             | 
| 113 | 
            -
                return isMatch
         | 
| 133 | 
            +
                ret_map
         | 
| 114 134 | 
             
              end
         | 
| 115 135 |  | 
| 116 136 | 
             
            end
         | 
| @@ -21,7 +21,7 @@ class GatherCompanies < Base | |
| 21 21 |  | 
| 22 22 | 
             
              end
         | 
| 23 23 |  | 
| 24 | 
            -
              def load_companies_to_db num_iterations, cur_start_position
         | 
| 24 | 
            +
              def load_companies_to_db num_iterations, cur_start_position, facet_location_code
         | 
| 25 25 |  | 
| 26 26 | 
             
                increment = 20
         | 
| 27 27 | 
             
                cnt = 1
         | 
| @@ -32,7 +32,7 @@ class GatherCompanies < Base | |
| 32 32 | 
             
                  resp = @@linkedin_client.query_companies ({
         | 
| 33 33 | 
             
                      "start" => cur_start_position.to_s << "&count=" << increment.to_s,
         | 
| 34 34 | 
             
                      "facet=industry" => @linkedin_tech_industry_codes,
         | 
| 35 | 
            -
                      " | 
| 35 | 
            +
                      "facet=location"=> facet_location_code
         | 
| 36 36 | 
             
                  })
         | 
| 37 37 | 
             
                  docs = resp['companies'].values[3]
         | 
| 38 38 | 
             
                  if docs != nil
         | 
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            require_relative "../base"
         | 
| 2 2 |  | 
| 3 | 
            -
            class GeoTagger | 
| 3 | 
            +
            class GeoTagger < Base
         | 
| 4 4 |  | 
| 5 5 | 
             
              require 'mongo_helper'
         | 
| 6 6 | 
             
              require 'map_data_extraction_util'
         | 
| @@ -97,7 +97,9 @@ class GeoTagger  < Base | |
| 97 97 |  | 
| 98 98 | 
             
              def load_geolocations_into_db
         | 
| 99 99 |  | 
| 100 | 
            -
                @companies_coll.find() | 
| 100 | 
            +
                companies = @companies_coll.find()
         | 
| 101 | 
            +
                companies_arr = companies.to_a
         | 
| 102 | 
            +
                companies_arr.each do |company|
         | 
| 101 103 |  | 
| 102 104 | 
             
                  if !company
         | 
| 103 105 | 
             
                    next
         | 
| @@ -0,0 +1,48 @@ | |
| 1 | 
            +
            require_relative "../base"
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Tags company documents in the 'companies' collection with the technologies
            # that the search client reports for each company's website.
            class TechAdTagger < Base

              require 'whos_using_what_search_client'
              # MongoHelper is used in #initialize, so require it explicitly instead of
              # relying on another file having loaded it first.
              require 'mongo_helper'

              # Builds the search client and takes a handle on the 'companies' collection.
              def initialize
                @search_client = WhosUsingWhatSearchClient.new

                # NOTE(review): get_mongo_connection is defined outside this file; presumably
                # it returns a database handle that can be indexed by collection name - confirm.
                @mongo_client = MongoHelper.get_mongo_connection
                @companies_coll = @mongo_client['companies']
              end

              # Walks every company document that has no "languages" field yet, asks the
              # search client which of the given keywords it finds for the company's
              # "websiteUrl", stores the answer under "languages" and writes the document back.
              #
              # tech_keywords - keywords handed unchanged to the search client
              #                 (e.g. programming language names).
              def tag_company_with_technologies tech_keywords

                untagged = @companies_coll.find("languages" => {"$exists" => false})

                untagged.each do |company|

                  found = @search_client.search tech_keywords, company["websiteUrl"]

                  # Copy into a fresh hash. A nil/false result is treated as "nothing found"
                  # so the company is still marked as processed and the run keeps going.
                  company['languages'] = {}.merge(found || {})

                  @companies_coll.update({"_id" => company["_id"]}, company)

                  puts "updating: #{company}"

                end
              end

            end
         | 
| @@ -1,7 +1,5 @@ | |
| 1 1 | 
             
            require_relative '../base'
         | 
| 2 2 |  | 
| 3 | 
            -
            include Mongo
         | 
| 4 | 
            -
             | 
| 5 3 | 
             
            class MongoHelper < Base
         | 
| 6 4 |  | 
| 7 5 | 
             
              require 'mongo'
         | 
| @@ -9,6 +7,8 @@ class MongoHelper < Base | |
| 9 7 | 
             
              require 'json'
         | 
| 10 8 | 
             
              require 'yaml'
         | 
| 11 9 |  | 
| 10 | 
            +
              include Mongo
         | 
| 11 | 
            +
             | 
| 12 12 | 
             
              def self.get_connection
         | 
| 13 13 | 
             
                return @db_connection if @db_connection
         | 
| 14 14 | 
             
                db = URI.parse(ENV["mongo.uri"].strip)
         | 
| @@ -0,0 +1,76 @@ | |
| 1 | 
            +
            # Script-style entry point for populating data. At class-load time it puts the
            # sibling source directories on the load path, requires the collaborating
            # classes and builds one instance of each. When this file is executed directly
            # it loads geolocations into the database.
            class DataPopulators

              # NOTE: the helpers below are class methods (def self.). A bare `private`
              # keyword does not apply to those, so they are public.

              # Prepends the sibling source directories to Ruby's load path so the bare
              # `require` calls in initial_requires can be resolved.
              def self.load_class_paths

                $LOAD_PATH.unshift(File.expand_path('../../data_gatherers', __FILE__))
                $LOAD_PATH.unshift(File.expand_path('../../data_searchers', __FILE__))
                $LOAD_PATH.unshift(File.expand_path('../../logging', __FILE__))

              end

              # Requires the collaborating classes.
              # Must be called after load_class_paths.
              def self.initial_requires

                require 'geo_tagger'
                require 'companies_searcher'
                require 'logger_factory'
                require 'gather_companies'
                require 'tech_ad_tagger'

              end

              # Sets up load paths, requires, and the shared objects / data holders that
              # the rest of this class body works with.
              def self.static_initialize

                # this must be called first
                load_class_paths
                # then this
                initial_requires

                # objects
                @@log = LoggerFactory.get_default_logger
                @@geo_tagger = GeoTagger.new @@log
                @@gather_companies = GatherCompanies.new
                @@companies_searcher = CompaniesSearcher.new @@geo_tagger
                @@tech_ad_tagger = TechAdTagger.new

                # data holders
                @@facet_location = "us:82" #Sacramento
                @@programming_languages = ["java", "ruby", "c#", "php", "python", "javascript"]

              end

              static_initialize

              # Only runs when this file is the program being executed, not when required.
              if __FILE__ == $PROGRAM_NAME

                begin
                  @@geo_tagger.load_geolocations_into_db
                rescue StandardError => e
                  # StandardError (not Exception) so Ctrl-C and exit requests still
                  # terminate the script instead of being reported as an error here.
                  puts e.message
                  puts e.backtrace
                end

              end

              # examples of other calls that can be placed in the block above:

              # @@tech_ad_tagger.tag_company_with_technologies @@programming_languages

              # @@gather_companies.load_companies_to_db 700, 0, @@facet_location

              # @@geo_tagger.load_geolocations_into_db

              # @@geo_tagger.update_companies_with_latitude_longitude

              # near = @@companies_searcher.zip_code_search "95688"

              # near = @@companies_searcher.geospatial_search(-122.4099154, 37.8059887)

            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: whos_using_what
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 0.3.0
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -116,12 +116,14 @@ files: | |
| 116 116 | 
             
            - lib/whos_using_what/data_searchers/companies_searcher.rb
         | 
| 117 117 | 
             
            - lib/whos_using_what/util/map_data_extraction_util.rb
         | 
| 118 118 | 
             
            - lib/whos_using_what/logging/logger_factory.rb
         | 
| 119 | 
            +
            - lib/whos_using_what/scripts/data_populators.rb
         | 
| 119 120 | 
             
            - lib/whos_using_what/api_clients/base_api_client.rb
         | 
| 120 | 
            -
            - lib/whos_using_what/api_clients/search_client.rb
         | 
| 121 121 | 
             
            - lib/whos_using_what/api_clients/linkedin_client.rb
         | 
| 122 | 
            +
            - lib/whos_using_what/api_clients/whos_using_what_search_client.rb
         | 
| 122 123 | 
             
            - lib/whos_using_what/api_clients/google_locations_client.rb
         | 
| 123 124 | 
             
            - lib/whos_using_what/data_gatherers/gather_companies.rb
         | 
| 124 125 | 
             
            - lib/whos_using_what/data_gatherers/geo_tagger.rb
         | 
| 126 | 
            +
            - lib/whos_using_what/data_gatherers/tech_ad_tagger.rb
         | 
| 125 127 | 
             
            - lib/whos_using_what/no_sql/mongo_helper.rb
         | 
| 126 128 | 
             
            - lib/whos_using_what/base.rb
         | 
| 127 129 | 
             
            homepage: http://rubygems.org/gems/whos_using_what
         |