whos_using_what 0.2.12 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  require_relative "../base"
2
2
 
3
- class SearchClient < Base
3
+ class WhosUsingWhatSearchClient < Base
4
4
 
5
5
  require "uri"
6
6
  require "rest-client"
@@ -22,11 +22,26 @@ class SearchClient < Base
22
22
 
23
23
  end
24
24
 
25
+ private
26
+
25
27
  def extractUrls (rawInput, mustContainUrl)
26
28
 
27
29
  acceptedUrls = Array.new
28
30
 
29
- urls = URI.extract(rawInput)
31
+ if (rawInput == nil)
32
+ return acceptedUrls
33
+ end
34
+
35
+ urls = []
36
+
37
+ begin
38
+ urls = URI.extract(rawInput)
39
+ end
40
+
41
+ if urls.size < 1
42
+ return urls
43
+ end
44
+
30
45
  urls.each do |url|
31
46
  add = true
32
47
  @negativeMatchUrlPatterns.each do |token|
@@ -75,10 +90,12 @@ class SearchClient < Base
75
90
 
76
91
  end
77
92
 
93
+ public
78
94
 
79
- def search(query, site)
80
95
 
81
- url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << site << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
96
+ #Performs a search-engine search restricted to a company's website, then attempts to determine whether the company has job listings for a given technology.
97
+ #If a matching ad is found, its URL is returned in the result map, keyed by the query
98
+ def search queries, url
82
99
 
83
100
  begin
84
101
  rawHtml = RestClient.get(url)
@@ -86,31 +103,34 @@ class SearchClient < Base
86
103
 
87
104
  end
88
105
 
89
- urls = extractUrls(rawHtml, site)
106
+ urls = extractUrls(rawHtml, url)
90
107
 
91
- isMatch = false
108
+ matching_url = nil
92
109
 
93
- at_least_one_nonexception_url = false
110
+ ret_map = Hash.new
94
111
 
95
112
  urls.each do |cur_url|
96
113
  begin
97
114
  html = RestClient.get(cur_url)
98
- uses_technology = determineIfUsesTechnology(query, html)
99
- at_least_one_nonexception_url = true
100
- if (uses_technology)
101
- isMatch = true
102
- break
115
+
116
+ queries.each do |query|
117
+
118
+ url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << cur_url << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
119
+
120
+ uses_technology = determineIfUsesTechnology(query, html)
121
+
122
+ if (uses_technology)
123
+ ret_map[query] = cur_url
124
+ end
125
+
103
126
  end
127
+
104
128
  rescue Exception => exception
105
- #raise exception
129
+ #deliberately ignored: some sites simply fail to load properly, so the error is probably not worth logging
106
130
  end
107
131
  end
108
132
 
109
- if (!at_least_one_nonexception_url)
110
- return false
111
- end
112
-
113
- return isMatch
133
+ ret_map
114
134
  end
115
135
 
116
136
  end
@@ -21,7 +21,7 @@ class GatherCompanies < Base
21
21
 
22
22
  end
23
23
 
24
- def load_companies_to_db num_iterations, cur_start_position
24
+ def load_companies_to_db num_iterations, cur_start_position, facet_location_code
25
25
 
26
26
  increment = 20
27
27
  cnt = 1
@@ -32,7 +32,7 @@ class GatherCompanies < Base
32
32
  resp = @@linkedin_client.query_companies ({
33
33
  "start" => cur_start_position.to_s << "&count=" << increment.to_s,
34
34
  "facet=industry" => @linkedin_tech_industry_codes,
35
- "locations:(address:(postal-code))" => "95688"
35
+ "facet=location"=> facet_location_code
36
36
  })
37
37
  docs = resp['companies'].values[3]
38
38
  if docs != nil
@@ -1,6 +1,6 @@
1
1
  require_relative "../base"
2
2
 
3
- class GeoTagger < Base
3
+ class GeoTagger < Base
4
4
 
5
5
  require 'mongo_helper'
6
6
  require 'map_data_extraction_util'
@@ -97,7 +97,9 @@ class GeoTagger < Base
97
97
 
98
98
  def load_geolocations_into_db
99
99
 
100
- @companies_coll.find().to_a.each do |company|
100
+ companies = @companies_coll.find()
101
+ companies_arr = companies.to_a
102
+ companies_arr.each do |company|
101
103
 
102
104
  if !company
103
105
  next
@@ -0,0 +1,48 @@
1
+ require_relative "../base"
2
+
3
+ class TechAdTagger < Base
4
+
5
+ require 'whos_using_what_search_client'
6
+
7
+
8
+ def initialize
9
+ @search_client = WhosUsingWhatSearchClient.new
10
+
11
+ @mongo_client = MongoHelper.get_mongo_connection
12
+ @companies_coll = @mongo_client['companies']
13
+
14
+
15
+ end
16
+
17
+
18
+ #iterates through the companies that have no "languages" field yet and updates each company's db record with the technologies found in ads on its website
19
+ def tag_company_with_technologies tech_keywords
20
+
21
+ companies = @companies_coll.find(
22
+ "languages" => {"$exists" => false}
23
+ )
24
+
25
+ languages = Hash.new
26
+
27
+ companies.each do |company|
28
+
29
+ languages = Hash.new
30
+
31
+ company_languages_map = @search_client.search tech_keywords, company["websiteUrl"]
32
+
33
+ company_languages_map.each do |key, value|
34
+
35
+ languages[key] = value
36
+
37
+ end
38
+
39
+ company['languages'] = languages
40
+
41
+ @companies_coll.update({"_id" => company["_id"]}, company)
42
+
43
+ puts "updating: " << company.to_s
44
+
45
+ end
46
+ end
47
+
48
+ end
@@ -1,7 +1,5 @@
1
1
  require_relative '../base'
2
2
 
3
- include Mongo
4
-
5
3
  class MongoHelper < Base
6
4
 
7
5
  require 'mongo'
@@ -9,6 +7,8 @@ class MongoHelper < Base
9
7
  require 'json'
10
8
  require 'yaml'
11
9
 
10
+ include Mongo
11
+
12
12
  def self.get_connection
13
13
  return @db_connection if @db_connection
14
14
  db = URI.parse(ENV["mongo.uri"].strip)
@@ -0,0 +1,76 @@
1
+ class DataPopulators
2
+
3
+ private
4
+
5
+ def self.load_class_paths
6
+
7
+ $:.unshift(File.expand_path('../../data_gatherers', __FILE__))
8
+ $:.unshift(File.expand_path('../../data_searchers', __FILE__))
9
+ $:.unshift(File.expand_path('../../logging', __FILE__))
10
+
11
+ end
12
+
13
+ #must be called after load_class_paths
14
+ def self.initial_requires
15
+
16
+ require 'geo_tagger'
17
+ require 'companies_searcher'
18
+ require 'logger_factory'
19
+ require 'gather_companies'
20
+ require 'tech_ad_tagger'
21
+
22
+ end
23
+
24
+
25
+ def self.static_initialize
26
+
27
+ #this must be called first
28
+ load_class_paths
29
+ #then this
30
+ initial_requires
31
+
32
+ #objects
33
+ @@log = LoggerFactory.get_default_logger
34
+ @@geo_tagger = GeoTagger.new @@log
35
+ @@gather_companies = GatherCompanies.new
36
+ @@companies_searcher = CompaniesSearcher.new @@geo_tagger
37
+ @@ech_ad_tagger = TechAdTagger.new
38
+
39
+ #data holders
40
+ @@facet_location = "us:82" #Sacramento
41
+ @@programming_languages = ["java", "ruby", "c#", "php", "python", "javascript"]
42
+
43
+
44
+ end
45
+
46
+ static_initialize
47
+
48
+ if __FILE__ == $PROGRAM_NAME
49
+
50
+
51
+ begin
52
+ @@geo_tagger.load_geolocations_into_db
53
+ rescue Exception => e
54
+ puts e.message
55
+ puts e.backtrace
56
+ end
57
+
58
+
59
+ end
60
+
61
+ #examples:
62
+
63
+ # self_instance.tech_ad_tagger.tag_company_with_technologies self_instance.programming_languages
64
+
65
+ # self_instance.gather_companies.load_companies_to_db 700, 0, facet_location
66
+
67
+ # self_instance.geo_tagger.load_geolocations_into_db
68
+
69
+ # self_instance.geo_tagger.update_companies_with_latitude_longitude
70
+
71
+ # near = self_instance.companies_searcher.zip_code_search "95688"
72
+
73
+ # near = self_instance.companies_searcher.geospatial_search -122.4099154, 37.8059887
74
+
75
+
76
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whos_using_what
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.12
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -116,12 +116,14 @@ files:
116
116
  - lib/whos_using_what/data_searchers/companies_searcher.rb
117
117
  - lib/whos_using_what/util/map_data_extraction_util.rb
118
118
  - lib/whos_using_what/logging/logger_factory.rb
119
+ - lib/whos_using_what/scripts/data_populators.rb
119
120
  - lib/whos_using_what/api_clients/base_api_client.rb
120
- - lib/whos_using_what/api_clients/search_client.rb
121
121
  - lib/whos_using_what/api_clients/linkedin_client.rb
122
+ - lib/whos_using_what/api_clients/whos_using_what_search_client.rb
122
123
  - lib/whos_using_what/api_clients/google_locations_client.rb
123
124
  - lib/whos_using_what/data_gatherers/gather_companies.rb
124
125
  - lib/whos_using_what/data_gatherers/geo_tagger.rb
126
+ - lib/whos_using_what/data_gatherers/tech_ad_tagger.rb
125
127
  - lib/whos_using_what/no_sql/mongo_helper.rb
126
128
  - lib/whos_using_what/base.rb
127
129
  homepage: http://rubygems.org/gems/whos_using_what