whos_using_what 0.2.12 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/whos_using_what/api_clients/{search_client.rb → whos_using_what_search_client.rb} +38 -18
- data/lib/whos_using_what/data_gatherers/gather_companies.rb +2 -2
- data/lib/whos_using_what/data_gatherers/geo_tagger.rb +4 -2
- data/lib/whos_using_what/data_gatherers/tech_ad_tagger.rb +48 -0
- data/lib/whos_using_what/no_sql/mongo_helper.rb +2 -2
- data/lib/whos_using_what/scripts/data_populators.rb +76 -0
- metadata +4 -2
@@ -1,6 +1,6 @@
|
|
1
1
|
require_relative "../base"
|
2
2
|
|
3
|
-
class SearchClient < Base
|
3
|
+
class WhosUsingWhatSearchClient < Base
|
4
4
|
|
5
5
|
require "uri"
|
6
6
|
require "rest-client"
|
@@ -22,11 +22,26 @@ class SearchClient < Base
|
|
22
22
|
|
23
23
|
end
|
24
24
|
|
25
|
+
private
|
26
|
+
|
25
27
|
def extractUrls (rawInput, mustContainUrl)
|
26
28
|
|
27
29
|
acceptedUrls = Array.new
|
28
30
|
|
29
|
-
|
31
|
+
if (rawInput == nil)
|
32
|
+
return acceptedUrls
|
33
|
+
end
|
34
|
+
|
35
|
+
urls = []
|
36
|
+
|
37
|
+
begin
|
38
|
+
urls = URI.extract(rawInput)
|
39
|
+
end
|
40
|
+
|
41
|
+
if urls.size < 1
|
42
|
+
return urls
|
43
|
+
end
|
44
|
+
|
30
45
|
urls.each do |url|
|
31
46
|
add = true
|
32
47
|
@negativeMatchUrlPatterns.each do |token|
|
@@ -75,10 +90,12 @@ class SearchClient < Base
|
|
75
90
|
|
76
91
|
end
|
77
92
|
|
93
|
+
public
|
78
94
|
|
79
|
-
def search(query, site)
|
80
95
|
|
81
|
-
|
96
|
+
#performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
|
97
|
+
#If an ad exists it is returned as part of map
|
98
|
+
def search queries, url
|
82
99
|
|
83
100
|
begin
|
84
101
|
rawHtml = RestClient.get(url)
|
@@ -86,31 +103,34 @@ class SearchClient < Base
|
|
86
103
|
|
87
104
|
end
|
88
105
|
|
89
|
-
urls = extractUrls(rawHtml,
|
106
|
+
urls = extractUrls(rawHtml, url)
|
90
107
|
|
91
|
-
|
108
|
+
matching_url = nil
|
92
109
|
|
93
|
-
|
110
|
+
ret_map = Hash.new
|
94
111
|
|
95
112
|
urls.each do |cur_url|
|
96
113
|
begin
|
97
114
|
html = RestClient.get(cur_url)
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
115
|
+
|
116
|
+
queries.each do |query|
|
117
|
+
|
118
|
+
url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << cur_url << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
|
119
|
+
|
120
|
+
uses_technology = determineIfUsesTechnology(query, html)
|
121
|
+
|
122
|
+
if (uses_technology)
|
123
|
+
ret_map[query] = cur_url
|
124
|
+
end
|
125
|
+
|
103
126
|
end
|
127
|
+
|
104
128
|
rescue Exception => exception
|
105
|
-
#
|
129
|
+
#don't really care at this point, probably not worth logging as some sites just don't end up loading properly
|
106
130
|
end
|
107
131
|
end
|
108
132
|
|
109
|
-
|
110
|
-
return false
|
111
|
-
end
|
112
|
-
|
113
|
-
return isMatch
|
133
|
+
ret_map
|
114
134
|
end
|
115
135
|
|
116
136
|
end
|
@@ -21,7 +21,7 @@ class GatherCompanies < Base
|
|
21
21
|
|
22
22
|
end
|
23
23
|
|
24
|
-
def load_companies_to_db num_iterations, cur_start_position
|
24
|
+
def load_companies_to_db num_iterations, cur_start_position, facet_location_code
|
25
25
|
|
26
26
|
increment = 20
|
27
27
|
cnt = 1
|
@@ -32,7 +32,7 @@ class GatherCompanies < Base
|
|
32
32
|
resp = @@linkedin_client.query_companies ({
|
33
33
|
"start" => cur_start_position.to_s << "&count=" << increment.to_s,
|
34
34
|
"facet=industry" => @linkedin_tech_industry_codes,
|
35
|
-
"
|
35
|
+
"facet=location"=> facet_location_code
|
36
36
|
})
|
37
37
|
docs = resp['companies'].values[3]
|
38
38
|
if docs != nil
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require_relative "../base"
|
2
2
|
|
3
|
-
class GeoTagger
|
3
|
+
class GeoTagger < Base
|
4
4
|
|
5
5
|
require 'mongo_helper'
|
6
6
|
require 'map_data_extraction_util'
|
@@ -97,7 +97,9 @@ class GeoTagger < Base
|
|
97
97
|
|
98
98
|
def load_geolocations_into_db
|
99
99
|
|
100
|
-
@companies_coll.find()
|
100
|
+
companies = @companies_coll.find()
|
101
|
+
companies_arr = companies.to_a
|
102
|
+
companies_arr.each do |company|
|
101
103
|
|
102
104
|
if !company
|
103
105
|
next
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require_relative "../base"
|
2
|
+
|
3
|
+
class TechAdTagger < Base
|
4
|
+
|
5
|
+
require 'whos_using_what_search_client'
|
6
|
+
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@search_client = WhosUsingWhatSearchClient.new
|
10
|
+
|
11
|
+
@mongo_client = MongoHelper.get_mongo_connection
|
12
|
+
@companies_coll = @mongo_client['companies']
|
13
|
+
|
14
|
+
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
#iterates through array and updates company db record with technologies found from ads from their website
|
19
|
+
def tag_company_with_technologies tech_keywords
|
20
|
+
|
21
|
+
companies = @companies_coll.find(
|
22
|
+
"languages" => {"$exists" => false}
|
23
|
+
)
|
24
|
+
|
25
|
+
languages = Hash.new
|
26
|
+
|
27
|
+
companies.each do |company|
|
28
|
+
|
29
|
+
languages = Hash.new
|
30
|
+
|
31
|
+
company_languages_map = @search_client.search tech_keywords, company["websiteUrl"]
|
32
|
+
|
33
|
+
company_languages_map.each do |key, value|
|
34
|
+
|
35
|
+
languages[key] = value
|
36
|
+
|
37
|
+
end
|
38
|
+
|
39
|
+
company['languages'] = languages
|
40
|
+
|
41
|
+
@companies_coll.update({"_id" => company["_id"]}, company)
|
42
|
+
|
43
|
+
puts "updating: " << company.to_s
|
44
|
+
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
@@ -1,7 +1,5 @@
|
|
1
1
|
require_relative '../base'
|
2
2
|
|
3
|
-
include Mongo
|
4
|
-
|
5
3
|
class MongoHelper < Base
|
6
4
|
|
7
5
|
require 'mongo'
|
@@ -9,6 +7,8 @@ class MongoHelper < Base
|
|
9
7
|
require 'json'
|
10
8
|
require 'yaml'
|
11
9
|
|
10
|
+
include Mongo
|
11
|
+
|
12
12
|
def self.get_connection
|
13
13
|
return @db_connection if @db_connection
|
14
14
|
db = URI.parse(ENV["mongo.uri"].strip)
|
@@ -0,0 +1,76 @@
|
|
1
|
+
class DataPopulators
|
2
|
+
|
3
|
+
private
|
4
|
+
|
5
|
+
def self.load_class_paths
|
6
|
+
|
7
|
+
$:.unshift(File.expand_path('../../data_gatherers', __FILE__))
|
8
|
+
$:.unshift(File.expand_path('../../data_searchers', __FILE__))
|
9
|
+
$:.unshift(File.expand_path('../../logging', __FILE__))
|
10
|
+
|
11
|
+
end
|
12
|
+
|
13
|
+
#must be after load_class_paths is called
|
14
|
+
def self.initial_requires
|
15
|
+
|
16
|
+
require 'geo_tagger'
|
17
|
+
require 'companies_searcher'
|
18
|
+
require 'logger_factory'
|
19
|
+
require 'gather_companies'
|
20
|
+
require 'tech_ad_tagger'
|
21
|
+
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
def self.static_initialize
|
26
|
+
|
27
|
+
#this must be called first
|
28
|
+
load_class_paths
|
29
|
+
#then this
|
30
|
+
initial_requires
|
31
|
+
|
32
|
+
#objects
|
33
|
+
@@log = LoggerFactory.get_default_logger
|
34
|
+
@@geo_tagger = GeoTagger.new @@log
|
35
|
+
@@gather_companies = GatherCompanies.new
|
36
|
+
@@companies_searcher = CompaniesSearcher.new @@geo_tagger
|
37
|
+
@@ech_ad_tagger = TechAdTagger.new
|
38
|
+
|
39
|
+
#data holders
|
40
|
+
@@facet_location = "us:82" #Sacramento
|
41
|
+
@@programming_languages = ["java", "ruby", "c#", "php", "python", "javascript"]
|
42
|
+
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
static_initialize
|
47
|
+
|
48
|
+
if __FILE__ == $PROGRAM_NAME
|
49
|
+
|
50
|
+
|
51
|
+
begin
|
52
|
+
@@geo_tagger.load_geolocations_into_db
|
53
|
+
rescue Exception => e
|
54
|
+
puts e.message
|
55
|
+
puts e.backtrace
|
56
|
+
end
|
57
|
+
|
58
|
+
|
59
|
+
end
|
60
|
+
|
61
|
+
#examples:
|
62
|
+
|
63
|
+
# self_instance.tech_ad_tagger.tag_company_with_technologies self_instance.programming_languages
|
64
|
+
|
65
|
+
# self_instance.gather_companies.load_companies_to_db 700, 0, facet_location
|
66
|
+
|
67
|
+
# self_instance.geo_tagger.load_geolocations_into_db
|
68
|
+
|
69
|
+
# self_instance.geo_tagger.update_companies_with_latitude_longitude
|
70
|
+
|
71
|
+
# near = self_instance.companies_searcher.zip_code_search "95688"
|
72
|
+
|
73
|
+
# near = self_instance.companies_searcher.geospatial_search -122.4099154, 37.8059887
|
74
|
+
|
75
|
+
|
76
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whos_using_what
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.12
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -116,12 +116,14 @@ files:
|
|
116
116
|
- lib/whos_using_what/data_searchers/companies_searcher.rb
|
117
117
|
- lib/whos_using_what/util/map_data_extraction_util.rb
|
118
118
|
- lib/whos_using_what/logging/logger_factory.rb
|
119
|
+
- lib/whos_using_what/scripts/data_populators.rb
|
119
120
|
- lib/whos_using_what/api_clients/base_api_client.rb
|
120
|
-
- lib/whos_using_what/api_clients/search_client.rb
|
121
121
|
- lib/whos_using_what/api_clients/linkedin_client.rb
|
122
|
+
- lib/whos_using_what/api_clients/whos_using_what_search_client.rb
|
122
123
|
- lib/whos_using_what/api_clients/google_locations_client.rb
|
123
124
|
- lib/whos_using_what/data_gatherers/gather_companies.rb
|
124
125
|
- lib/whos_using_what/data_gatherers/geo_tagger.rb
|
126
|
+
- lib/whos_using_what/data_gatherers/tech_ad_tagger.rb
|
125
127
|
- lib/whos_using_what/no_sql/mongo_helper.rb
|
126
128
|
- lib/whos_using_what/base.rb
|
127
129
|
homepage: http://rubygems.org/gems/whos_using_what
|