whos_using_what 0.2.12 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/whos_using_what/api_clients/{search_client.rb → whos_using_what_search_client.rb} +38 -18
- data/lib/whos_using_what/data_gatherers/gather_companies.rb +2 -2
- data/lib/whos_using_what/data_gatherers/geo_tagger.rb +4 -2
- data/lib/whos_using_what/data_gatherers/tech_ad_tagger.rb +48 -0
- data/lib/whos_using_what/no_sql/mongo_helper.rb +2 -2
- data/lib/whos_using_what/scripts/data_populators.rb +76 -0
- metadata +4 -2
@@ -1,6 +1,6 @@
|
|
1
1
|
require_relative "../base"
|
2
2
|
|
3
|
-
class SearchClient < Base
|
3
|
+
class WhosUsingWhatSearchClient < Base
|
4
4
|
|
5
5
|
require "uri"
|
6
6
|
require "rest-client"
|
@@ -22,11 +22,26 @@ class SearchClient < Base
|
|
22
22
|
|
23
23
|
end
|
24
24
|
|
25
|
+
private
|
26
|
+
|
25
27
|
def extractUrls (rawInput, mustContainUrl)
|
26
28
|
|
27
29
|
acceptedUrls = Array.new
|
28
30
|
|
29
|
-
|
31
|
+
if (rawInput == nil)
|
32
|
+
return acceptedUrls
|
33
|
+
end
|
34
|
+
|
35
|
+
urls = []
|
36
|
+
|
37
|
+
begin
|
38
|
+
urls = URI.extract(rawInput)
|
39
|
+
end
|
40
|
+
|
41
|
+
if urls.size < 1
|
42
|
+
return urls
|
43
|
+
end
|
44
|
+
|
30
45
|
urls.each do |url|
|
31
46
|
add = true
|
32
47
|
@negativeMatchUrlPatterns.each do |token|
|
@@ -75,10 +90,12 @@ class SearchClient < Base
|
|
75
90
|
|
76
91
|
end
|
77
92
|
|
93
|
+
public
|
78
94
|
|
79
|
-
def search(query, site)
|
80
95
|
|
81
|
-
|
96
|
+
#performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
|
97
|
+
#If an ad exists it is returned as part of map
|
98
|
+
def search queries, url
|
82
99
|
|
83
100
|
begin
|
84
101
|
rawHtml = RestClient.get(url)
|
@@ -86,31 +103,34 @@ class SearchClient < Base
|
|
86
103
|
|
87
104
|
end
|
88
105
|
|
89
|
-
urls = extractUrls(rawHtml,
|
106
|
+
urls = extractUrls(rawHtml, url)
|
90
107
|
|
91
|
-
|
108
|
+
matching_url = nil
|
92
109
|
|
93
|
-
|
110
|
+
ret_map = Hash.new
|
94
111
|
|
95
112
|
urls.each do |cur_url|
|
96
113
|
begin
|
97
114
|
html = RestClient.get(cur_url)
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
115
|
+
|
116
|
+
queries.each do |query|
|
117
|
+
|
118
|
+
url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << cur_url << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
|
119
|
+
|
120
|
+
uses_technology = determineIfUsesTechnology(query, html)
|
121
|
+
|
122
|
+
if (uses_technology)
|
123
|
+
ret_map[query] = cur_url
|
124
|
+
end
|
125
|
+
|
103
126
|
end
|
127
|
+
|
104
128
|
rescue Exception => exception
|
105
|
-
#
|
129
|
+
#don't really care at this point, probably not worth logging as some sites just don't end up loading properly
|
106
130
|
end
|
107
131
|
end
|
108
132
|
|
109
|
-
|
110
|
-
return false
|
111
|
-
end
|
112
|
-
|
113
|
-
return isMatch
|
133
|
+
ret_map
|
114
134
|
end
|
115
135
|
|
116
136
|
end
|
@@ -21,7 +21,7 @@ class GatherCompanies < Base
|
|
21
21
|
|
22
22
|
end
|
23
23
|
|
24
|
-
def load_companies_to_db num_iterations, cur_start_position
|
24
|
+
def load_companies_to_db num_iterations, cur_start_position, facet_location_code
|
25
25
|
|
26
26
|
increment = 20
|
27
27
|
cnt = 1
|
@@ -32,7 +32,7 @@ class GatherCompanies < Base
|
|
32
32
|
resp = @@linkedin_client.query_companies ({
|
33
33
|
"start" => cur_start_position.to_s << "&count=" << increment.to_s,
|
34
34
|
"facet=industry" => @linkedin_tech_industry_codes,
|
35
|
-
"
|
35
|
+
"facet=location"=> facet_location_code
|
36
36
|
})
|
37
37
|
docs = resp['companies'].values[3]
|
38
38
|
if docs != nil
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require_relative "../base"
|
2
2
|
|
3
|
-
class GeoTagger
|
3
|
+
class GeoTagger < Base
|
4
4
|
|
5
5
|
require 'mongo_helper'
|
6
6
|
require 'map_data_extraction_util'
|
@@ -97,7 +97,9 @@ class GeoTagger < Base
|
|
97
97
|
|
98
98
|
def load_geolocations_into_db
|
99
99
|
|
100
|
-
@companies_coll.find()
|
100
|
+
companies = @companies_coll.find()
|
101
|
+
companies_arr = companies.to_a
|
102
|
+
companies_arr.each do |company|
|
101
103
|
|
102
104
|
if !company
|
103
105
|
next
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require_relative "../base"
|
2
|
+
|
3
|
+
# Tags company records in the 'companies' collection with the technologies
# the search client reports for each company's website.
class TechAdTagger < Base

  require 'whos_using_what_search_client'

  # Creates the search client and looks up the 'companies' collection on the
  # object returned by MongoHelper.get_mongo_connection.
  # NOTE(review): confirm MongoHelper actually defines get_mongo_connection;
  # TODO verify against mongo_helper.rb.
  def initialize
    @search_client = WhosUsingWhatSearchClient.new

    @mongo_client = MongoHelper.get_mongo_connection
    @companies_coll = @mongo_client['companies']
  end

  # Iterates through every company document that has no "languages" field yet
  # and updates the db record with the technologies found for its website.
  #
  # tech_keywords - collection of technology names handed to the search
  #                 client as the queries to look for.
  #
  # For each company the result of @search_client.search (pairs of
  # keyword => matching value) is stored under company['languages'] and the
  # whole document is written back, keyed by its "_id".
  def tag_company_with_technologies tech_keywords
    untagged = @companies_coll.find(
      "languages" => {"$exists" => false}
    )

    # Materialize the result set before looping: the loop body updates the
    # very documents the query matches, so iterate over a snapshot rather
    # than the live query result.
    untagged.to_a.each do |company|
      matches = @search_client.search tech_keywords, company["websiteUrl"]

      # Copy the pairs into a fresh hash so the stored value is a plain Hash
      # independent of whatever object the search client returned.
      company['languages'] = matches.each_with_object({}) do |(keyword, match), found|
        found[keyword] = match
      end

      @companies_coll.update({"_id" => company["_id"]}, company)

      puts "updating: #{company}"
    end
  end

end
|
@@ -1,7 +1,5 @@
|
|
1
1
|
require_relative '../base'
|
2
2
|
|
3
|
-
include Mongo
|
4
|
-
|
5
3
|
class MongoHelper < Base
|
6
4
|
|
7
5
|
require 'mongo'
|
@@ -9,6 +7,8 @@ class MongoHelper < Base
|
|
9
7
|
require 'json'
|
10
8
|
require 'yaml'
|
11
9
|
|
10
|
+
include Mongo
|
11
|
+
|
12
12
|
def self.get_connection
|
13
13
|
return @db_connection if @db_connection
|
14
14
|
db = URI.parse(ENV["mongo.uri"].strip)
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# Script-style holder that sets up the load path, requires the data
# gatherer/searcher classes, and builds the shared objects used to populate
# the database. All setup runs as soon as this class body is evaluated.
class DataPopulators

  # The helpers below are class-level (`def self.`) methods. A bare `private`
  # keyword does not apply to such methods, so they remain publicly callable;
  # only static_initialize is meant to be invoked, from the class body below.

  # Prepends the sibling source directories to the load path so that the
  # plain `require` calls in initial_requires can resolve.
  def self.load_class_paths
    %w[data_gatherers data_searchers logging].each do |dir|
      $:.unshift(File.expand_path("../../#{dir}", __FILE__))
    end
  end

  # Requires the classes used by static_initialize.
  # Must be called after load_class_paths.
  def self.initial_requires
    require 'geo_tagger'
    require 'companies_searcher'
    require 'logger_factory'
    require 'gather_companies'
    require 'tech_ad_tagger'
  end

  # Sets up the load path, requires dependencies, then builds the shared
  # objects and data holders as class variables.
  def self.static_initialize
    # this must be called first
    load_class_paths
    # then this
    initial_requires

    # objects
    @@log = LoggerFactory.get_default_logger
    @@geo_tagger = GeoTagger.new @@log
    @@gather_companies = GatherCompanies.new
    @@companies_searcher = CompaniesSearcher.new @@geo_tagger
    @@tech_ad_tagger = TechAdTagger.new

    # data holders
    @@facet_location = "us:82" #Sacramento
    @@programming_languages = ["java", "ruby", "c#", "php", "python", "javascript"]
  end

  static_initialize

  # Only when this file is executed directly (not merely required).
  if __FILE__ == $PROGRAM_NAME

    begin
      @@geo_tagger.load_geolocations_into_db
    rescue StandardError => e
      # Report the failure instead of aborting with an unhandled error;
      # interrupts and exit requests are deliberately not intercepted.
      puts e.message
      puts e.backtrace
    end

  end

  #examples:

  #  self_instance.tech_ad_tagger.tag_company_with_technologies self_instance.programming_languages

  #  self_instance.gather_companies.load_companies_to_db 700, 0, facet_location

  #  self_instance.geo_tagger.load_geolocations_into_db

  #  self_instance.geo_tagger.update_companies_with_latitude_longitude

  #  near = self_instance.companies_searcher.zip_code_search "95688"

  #  near = self_instance.companies_searcher.geospatial_search -122.4099154, 37.8059887

end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whos_using_what
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.12
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -116,12 +116,14 @@ files:
|
|
116
116
|
- lib/whos_using_what/data_searchers/companies_searcher.rb
|
117
117
|
- lib/whos_using_what/util/map_data_extraction_util.rb
|
118
118
|
- lib/whos_using_what/logging/logger_factory.rb
|
119
|
+
- lib/whos_using_what/scripts/data_populators.rb
|
119
120
|
- lib/whos_using_what/api_clients/base_api_client.rb
|
120
|
-
- lib/whos_using_what/api_clients/search_client.rb
|
121
121
|
- lib/whos_using_what/api_clients/linkedin_client.rb
|
122
|
+
- lib/whos_using_what/api_clients/whos_using_what_search_client.rb
|
122
123
|
- lib/whos_using_what/api_clients/google_locations_client.rb
|
123
124
|
- lib/whos_using_what/data_gatherers/gather_companies.rb
|
124
125
|
- lib/whos_using_what/data_gatherers/geo_tagger.rb
|
126
|
+
- lib/whos_using_what/data_gatherers/tech_ad_tagger.rb
|
125
127
|
- lib/whos_using_what/no_sql/mongo_helper.rb
|
126
128
|
- lib/whos_using_what/base.rb
|
127
129
|
homepage: http://rubygems.org/gems/whos_using_what
|