whos_using_what 0.2.12 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  require_relative "../base"
2
2
 
3
- class SearchClient < Base
3
+ class WhosUsingWhatSearchClient < Base
4
4
 
5
5
  require "uri"
6
6
  require "rest-client"
@@ -22,11 +22,26 @@ class SearchClient < Base
22
22
 
23
23
  end
24
24
 
25
+ private
26
+
25
27
  def extractUrls (rawInput, mustContainUrl)
26
28
 
27
29
  acceptedUrls = Array.new
28
30
 
29
- urls = URI.extract(rawInput)
31
+ if (rawInput == nil)
32
+ return acceptedUrls
33
+ end
34
+
35
+ urls = []
36
+
37
+ begin
38
+ urls = URI.extract(rawInput)
39
+ end
40
+
41
+ if urls.size < 1
42
+ return urls
43
+ end
44
+
30
45
  urls.each do |url|
31
46
  add = true
32
47
  @negativeMatchUrlPatterns.each do |token|
@@ -75,10 +90,12 @@ class SearchClient < Base
75
90
 
76
91
  end
77
92
 
93
+ public
78
94
 
79
- def search(query, site)
80
95
 
81
- url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << site << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
96
+ #performs a search-engine search restricted to a company's website, then attempts to determine whether the company has job listings for a given technology.
97
+ #if an ad exists, it is returned as part of the result map (query => matching url)
98
+ def search queries, url
82
99
 
83
100
  begin
84
101
  rawHtml = RestClient.get(url)
@@ -86,31 +103,34 @@ class SearchClient < Base
86
103
 
87
104
  end
88
105
 
89
- urls = extractUrls(rawHtml, site)
106
+ urls = extractUrls(rawHtml, url)
90
107
 
91
- isMatch = false
108
+ matching_url = nil
92
109
 
93
- at_least_one_nonexception_url = false
110
+ ret_map = Hash.new
94
111
 
95
112
  urls.each do |cur_url|
96
113
  begin
97
114
  html = RestClient.get(cur_url)
98
- uses_technology = determineIfUsesTechnology(query, html)
99
- at_least_one_nonexception_url = true
100
- if (uses_technology)
101
- isMatch = true
102
- break
115
+
116
+ queries.each do |query|
117
+
118
+ url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << cur_url << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
119
+
120
+ uses_technology = determineIfUsesTechnology(query, html)
121
+
122
+ if (uses_technology)
123
+ ret_map[query] = cur_url
124
+ end
125
+
103
126
  end
127
+
104
128
  rescue Exception => exception
105
- #raise exception
129
+ #errors are deliberately ignored here: some sites simply fail to load properly, so this is probably not worth logging
106
130
  end
107
131
  end
108
132
 
109
- if (!at_least_one_nonexception_url)
110
- return false
111
- end
112
-
113
- return isMatch
133
+ ret_map
114
134
  end
115
135
 
116
136
  end
@@ -21,7 +21,7 @@ class GatherCompanies < Base
21
21
 
22
22
  end
23
23
 
24
- def load_companies_to_db num_iterations, cur_start_position
24
+ def load_companies_to_db num_iterations, cur_start_position, facet_location_code
25
25
 
26
26
  increment = 20
27
27
  cnt = 1
@@ -32,7 +32,7 @@ class GatherCompanies < Base
32
32
  resp = @@linkedin_client.query_companies ({
33
33
  "start" => cur_start_position.to_s << "&count=" << increment.to_s,
34
34
  "facet=industry" => @linkedin_tech_industry_codes,
35
- "locations:(address:(postal-code))" => "95688"
35
+ "facet=location"=> facet_location_code
36
36
  })
37
37
  docs = resp['companies'].values[3]
38
38
  if docs != nil
@@ -1,6 +1,6 @@
1
1
  require_relative "../base"
2
2
 
3
- class GeoTagger < Base
3
+ class GeoTagger < Base
4
4
 
5
5
  require 'mongo_helper'
6
6
  require 'map_data_extraction_util'
@@ -97,7 +97,9 @@ class GeoTagger < Base
97
97
 
98
98
  def load_geolocations_into_db
99
99
 
100
- @companies_coll.find().to_a.each do |company|
100
+ companies = @companies_coll.find()
101
+ companies_arr = companies.to_a
102
+ companies_arr.each do |company|
101
103
 
102
104
  if !company
103
105
  next
@@ -0,0 +1,48 @@
1
+ require_relative "../base"
2
+
3
+ class TechAdTagger < Base
4
+
5
+ require 'whos_using_what_search_client'
6
+
7
+
8
+ def initialize
9
+ @search_client = WhosUsingWhatSearchClient.new
10
+
11
+ @mongo_client = MongoHelper.get_mongo_connection
12
+ @companies_coll = @mongo_client['companies']
13
+
14
+
15
+ end
16
+
17
+
18
+ #iterates through the companies and updates each company's db record with the technologies found in ads on its website
19
+ def tag_company_with_technologies tech_keywords
20
+
21
+ companies = @companies_coll.find(
22
+ "languages" => {"$exists" => false}
23
+ )
24
+
25
+ languages = Hash.new
26
+
27
+ companies.each do |company|
28
+
29
+ languages = Hash.new
30
+
31
+ company_languages_map = @search_client.search tech_keywords, company["websiteUrl"]
32
+
33
+ company_languages_map.each do |key, value|
34
+
35
+ languages[key] = value
36
+
37
+ end
38
+
39
+ company['languages'] = languages
40
+
41
+ @companies_coll.update({"_id" => company["_id"]}, company)
42
+
43
+ puts "updating: " << company.to_s
44
+
45
+ end
46
+ end
47
+
48
+ end
@@ -1,7 +1,5 @@
1
1
  require_relative '../base'
2
2
 
3
- include Mongo
4
-
5
3
  class MongoHelper < Base
6
4
 
7
5
  require 'mongo'
@@ -9,6 +7,8 @@ class MongoHelper < Base
9
7
  require 'json'
10
8
  require 'yaml'
11
9
 
10
+ include Mongo
11
+
12
12
  def self.get_connection
13
13
  return @db_connection if @db_connection
14
14
  db = URI.parse(ENV["mongo.uri"].strip)
@@ -0,0 +1,76 @@
1
+ class DataPopulators
2
+
3
+ private
4
+
5
+ def self.load_class_paths
6
+
7
+ $:.unshift(File.expand_path('../../data_gatherers', __FILE__))
8
+ $:.unshift(File.expand_path('../../data_searchers', __FILE__))
9
+ $:.unshift(File.expand_path('../../logging', __FILE__))
10
+
11
+ end
12
+
13
+ #must be called after load_class_paths
14
+ def self.initial_requires
15
+
16
+ require 'geo_tagger'
17
+ require 'companies_searcher'
18
+ require 'logger_factory'
19
+ require 'gather_companies'
20
+ require 'tech_ad_tagger'
21
+
22
+ end
23
+
24
+
25
+ def self.static_initialize
26
+
27
+ #this must be called first
28
+ load_class_paths
29
+ #then this
30
+ initial_requires
31
+
32
+ #objects
33
+ @@log = LoggerFactory.get_default_logger
34
+ @@geo_tagger = GeoTagger.new @@log
35
+ @@gather_companies = GatherCompanies.new
36
+ @@companies_searcher = CompaniesSearcher.new @@geo_tagger
37
+ @@ech_ad_tagger = TechAdTagger.new
38
+
39
+ #data holders
40
+ @@facet_location = "us:82" #Sacramento
41
+ @@programming_languages = ["java", "ruby", "c#", "php", "python", "javascript"]
42
+
43
+
44
+ end
45
+
46
+ static_initialize
47
+
48
+ if __FILE__ == $PROGRAM_NAME
49
+
50
+
51
+ begin
52
+ @@geo_tagger.load_geolocations_into_db
53
+ rescue Exception => e
54
+ puts e.message
55
+ puts e.backtrace
56
+ end
57
+
58
+
59
+ end
60
+
61
+ #examples:
62
+
63
+ # self_instance.tech_ad_tagger.tag_company_with_technologies self_instance.programming_languages
64
+
65
+ # self_instance.gather_companies.load_companies_to_db 700, 0, facet_location
66
+
67
+ # self_instance.geo_tagger.load_geolocations_into_db
68
+
69
+ # self_instance.geo_tagger.update_companies_with_latitude_longitude
70
+
71
+ # near = self_instance.companies_searcher.zip_code_search "95688"
72
+
73
+ # near = self_instance.companies_searcher.geospatial_search -122.4099154, 37.8059887
74
+
75
+
76
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whos_using_what
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.12
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -116,12 +116,14 @@ files:
116
116
  - lib/whos_using_what/data_searchers/companies_searcher.rb
117
117
  - lib/whos_using_what/util/map_data_extraction_util.rb
118
118
  - lib/whos_using_what/logging/logger_factory.rb
119
+ - lib/whos_using_what/scripts/data_populators.rb
119
120
  - lib/whos_using_what/api_clients/base_api_client.rb
120
- - lib/whos_using_what/api_clients/search_client.rb
121
121
  - lib/whos_using_what/api_clients/linkedin_client.rb
122
+ - lib/whos_using_what/api_clients/whos_using_what_search_client.rb
122
123
  - lib/whos_using_what/api_clients/google_locations_client.rb
123
124
  - lib/whos_using_what/data_gatherers/gather_companies.rb
124
125
  - lib/whos_using_what/data_gatherers/geo_tagger.rb
126
+ - lib/whos_using_what/data_gatherers/tech_ad_tagger.rb
125
127
  - lib/whos_using_what/no_sql/mongo_helper.rb
126
128
  - lib/whos_using_what/base.rb
127
129
  homepage: http://rubygems.org/gems/whos_using_what