whos_using_what 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/whos_using_what/api_clients/base_api_client.rb +54 -0
- data/lib/whos_using_what/api_clients/google_client.rb +177 -0
- data/lib/whos_using_what/base.rb +5 -1
- data/lib/whos_using_what/data_gatherers/tech_ad_tagger.rb +15 -6
- data/lib/whos_using_what/scripts/data_populators.rb +28 -13
- metadata +50 -2
- data/lib/whos_using_what/api_clients/whos_using_what_search_client.rb +0 -137
@@ -2,6 +2,60 @@ require_relative "../base"
|
|
2
2
|
|
3
3
|
class BaseApiClient < Base
|
4
4
|
|
5
|
+
require "uri"
|
6
|
+
require "rest-client"
|
7
|
+
|
8
|
+
# Case-insensitively checks whether +rawHtml+ contains at least one of the
# tokens in +array+.
#
# @param array [Array<String>, nil] substrings to look for
# @param rawHtml [String, nil] text to search
# @return [Boolean] true as soon as any token occurs in the text; false when
#   nothing matches or when either argument is nil
def arraySearch(array, rawHtml)
  # A nil argument is treated as "no match" instead of raising
  # NoMethodError on nil.downcase / nil.each.
  return false if array.nil? || rawHtml.nil?

  haystack = rawHtml.downcase
  # Tokens are lower-cased as well, so a mixed-case token still matches the
  # lower-cased text.
  array.any? { |token| haystack.include?(token.downcase) }
end
|
18
|
+
|
19
|
+
# Joins the entries of +array+ into a single string, separated by +delim+.
# Surrounding whitespace is removed from every entry and from the final
# result. Entries are converted with +to_s+ first, so a nil entry contributes
# an empty string instead of raising.
#
# @param array [Array<#to_s>] pieces to join
# @param delim [String] separator placed between the pieces
# @return [String] the joined string ("" for an empty array)
def arry_to_str_delim(array, delim)
  array.map { |entry| entry.to_s.strip }.join(delim).strip
end
|
35
|
+
|
36
|
+
|
37
|
+
# Returns a cleaned-up copy of +url+: surrounding whitespace is stripped, then
# the first occurrence of "www." and the first occurrence of "site:" are
# removed. The argument itself is not modified.
#
# Note that the markers are removed wherever they first appear in the string,
# not only when they are a prefix.
#
# @param url [String] URL or search-engine site filter to clean
# @return [String] the cleaned string
def cleanup_url(url)
  # String#sub with a String pattern replaces only the first literal
  # occurrence and returns a new string.
  url.strip.sub("www.", "").sub("site:", "")
end
|
49
|
+
|
50
|
+
# Reports whether +rawHtml+ contains any of the tokens stored in
# @jobPageTokens; the matching itself is delegated to arraySearch.
#
# NOTE(review): +technology+ is accepted but never consulted, so the answer is
# the same for every technology -- confirm this is intended.
#
# @param technology [Object] currently unused
# @param rawHtml [String] page content to scan
# @return [Boolean] whatever arraySearch reports for the job-page tokens
def determineIfUsesTechnology(technology, rawHtml)
  arraySearch(@jobPageTokens, rawHtml)
end
|
57
|
+
|
58
|
+
|
5
59
|
def starts_with?(string, prefix)
|
6
60
|
prefix = prefix.to_s
|
7
61
|
string[0, prefix.length] == prefix
|
@@ -0,0 +1,177 @@
|
|
1
|
+
require_relative "base_api_client"
|
2
|
+
require 'mechanize'
|
3
|
+
require 'watir-webdriver'
|
4
|
+
require 'headless'
|
5
|
+
|
6
|
+
class GoogleClient < BaseApiClient
|
7
|
+
|
8
|
+
attr :results
|
9
|
+
|
10
|
+
# Prepares the URL filter lists, the keywords to search for, the job-page
# tokens, an empty results hash, a Mechanize agent and a Firefox browser
# driven through Watir.
def initialize
  # Substrings that disqualify a URL / prefixes that qualify one.
  @negativeMatchUrlPatterns = %w[google youtube duckduckgo bing yahoo]
  @positiveMatchUrlPatterns = %w[http www]

  @technologiesToSearchFor = %w[ruby java javascript python]
  @jobPageTokens = %w[job hiring career]

  @results = {}

  @mechanize = Mechanize.new

  # Headless is started before the browser is created (order preserved);
  # presumably it supplies the virtual display Firefox runs in -- TODO confirm.
  virtual_display = Headless.new
  virtual_display.start
  @browser = Watir::Browser.new(:firefox)
end
|
30
|
+
|
31
|
+
|
32
|
+
# Pulls every URL out of +rawInput+ (via URI.extract) and returns the unique,
# cleaned-up ones that pass all filters:
#
# * the cleaned URL starts with one of @positiveMatchUrlPatterns or with the
#   cleaned +mustContainUrl+,
# * it contains the cleaned +mustContainUrl+,
# * it contains none of @negativeMatchUrlPatterns.
#
# @param rawInput [String, nil] text to scan for URLs
# @param mustContainUrl [String] site every accepted URL has to contain
# @return [Array<String>] accepted URLs in order of first appearance; empty
#   when +rawInput+ is nil or holds no URLs
def extractUrls(rawInput, mustContainUrl)
  return [] if rawInput.nil?

  candidates = URI.extract(rawInput)
  return [] if candidates.empty?

  required = cleanup_url(mustContainUrl)

  candidates.each_with_object([]) do |candidate, accepted|
    normalized = cleanup_url(candidate)

    recognised = @positiveMatchUrlPatterns.any? do |prefix|
      starts_with?(normalized, prefix) || starts_with?(normalized, required)
    end
    next unless recognised

    on_site = normalized.include?(required)
    blocked = @negativeMatchUrlPatterns.any? { |token| normalized.include?(token) }

    # The URL is cleaned a second time before it is stored, exactly as
    # before; the filters above were evaluated on the once-cleaned value.
    normalized = cleanup_url(normalized)

    next unless on_site && !blocked

    accepted << normalized unless accepted.include?(normalized)
  end
end
|
90
|
+
|
91
|
+
# Builds a Google search URL for "hiring <search_keyword>" restricted to
# +site_url+ (passed through cleanup_url first).
#
# The keyword is stripped and form-encoded so characters such as '#', '+' or
# spaces cannot truncate or alter the query string; plain alphanumeric
# keywords produce the same URL as before.
#
# @param site_url [String] site to restrict the search to
# @param search_keyword [#to_s] technology keyword to search for
# @return [String] the complete search URL
def generate_google_url(site_url, search_keyword)
  keyword = URI.encode_www_form_component(search_keyword.to_s.strip)

  "http://www.google.com/search?hl=en&as_q=hiring+#{keyword}" \
    "&as_sitesearch=#{cleanup_url(site_url)}"
end
|
102
|
+
|
103
|
+
# Builds a DuckDuckGo query URL of the form
# "site:<site> hiring <search_keyword>", where <site> is +site_url+ passed
# through cleanup_url.
#
# The keyword is stripped and form-encoded so characters such as '#', '+' or
# spaces cannot truncate or alter the query string; plain alphanumeric
# keywords produce the same URL as before.
#
# @param site_url [String] site to restrict the search to
# @param search_keyword [#to_s] technology keyword to search for
# @return [String] the complete search URL
def generate_duckduckgo_url(site_url, search_keyword)
  keyword = URI.encode_www_form_component(search_keyword.to_s.strip)

  "http://duckduckgo.com/?q=site:#{cleanup_url(site_url)}+hiring+#{keyword}"
end
|
114
|
+
|
115
|
+
#performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
|
116
|
+
#If an ad exists it is returned as part of map
|
117
|
+
# For every keyword in @technologiesToSearchFor, runs a site-restricted
# DuckDuckGo search through the browser, visits each accepted result URL and
# records the URL when determineIfUsesTechnology approves the page.
#
# Failures while querying or loading a page are printed and skipped so one bad
# page does not abort the whole run.
#
# NOTE(review): +queries+ is accepted but not used; the keyword list comes
# from @technologiesToSearchFor -- confirm this is intended.
#
# @param queries [Array<String>] currently unused
# @param site_url [String] company website the search is restricted to
# @return [Hash{String => String}] keyword => last matching URL found for it
def google_search(queries, site_url)
  ret_map = {}

  @technologiesToSearchFor.each do |search_keyword|
    url = ""
    raw_html = ""

    begin
      url = generate_duckduckgo_url(site_url, search_keyword)

      # perform initial search engine search
      @browser.goto url
      raw_html = @browser.html

      puts "successfully queried url:#{url}"
    rescue StandardError => e
      # StandardError (not Exception) so interrupts and exit requests still
      # propagate instead of being swallowed here.
      puts "exception:#{e.message} when querying url: #{url}"
    end

    extractUrls(raw_html, site_url).each do |cur_url|
      begin
        @browser.goto cur_url
        html = @browser.html

        # TODO: stripping all html tags here (for readability) was causing an
        # exception, so the raw html is inspected as-is.
        ret_map[search_keyword] = cur_url if determineIfUsesTechnology(search_keyword, html)
      rescue StandardError => e
        puts e.message
      end
    end
  end

  # Throttle to avoid being black-listed by the search engine. The previous
  # rand(1-5) evaluated to rand(-4), i.e. 0..3 seconds, and could skip the
  # pause entirely; rand(1..5) always waits between 1 and 5 seconds.
  sleep rand(1..5)

  ret_map
end
|
175
|
+
|
176
|
+
end
|
177
|
+
|
data/lib/whos_using_what/base.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
class Base
|
2
2
|
|
3
|
+
attr :set_paths
|
4
|
+
|
5
|
+
@@paths_set = false
|
6
|
+
|
3
7
|
def self.set_paths
|
4
8
|
$:.unshift(File.expand_path('../data_gatherers', __FILE__))
|
5
9
|
$:.unshift(File.expand_path('../data_searchers', __FILE__))
|
@@ -8,9 +12,9 @@ class Base
|
|
8
12
|
$:.unshift(File.expand_path('../util', __FILE__))
|
9
13
|
$:.unshift(File.expand_path('../logging', __FILE__))
|
10
14
|
|
15
|
+
@@paths_set = true
|
11
16
|
end
|
12
17
|
|
13
18
|
set_paths
|
14
19
|
|
15
|
-
|
16
20
|
end
|
@@ -2,11 +2,11 @@ require_relative "../base"
|
|
2
2
|
|
3
3
|
class TechAdTagger < Base
|
4
4
|
|
5
|
-
|
5
|
+
def initialize
|
6
6
|
|
7
|
+
require_relative '../api_clients/google_client'
|
7
8
|
|
8
|
-
|
9
|
-
@search_client = WhosUsingWhatSearchClient.new
|
9
|
+
@search_client = GoogleClient.new
|
10
10
|
|
11
11
|
@mongo_client = MongoHelper.get_mongo_connection
|
12
12
|
@companies_coll = @mongo_client['companies']
|
@@ -18,9 +18,18 @@ class TechAdTagger < Base
|
|
18
18
|
#iterates through array and updates company db record with technologies found from ads from their website
|
19
19
|
def tag_company_with_technologies tech_keywords
|
20
20
|
|
21
|
+
# uncomment if need to clear out all existing technologies
|
22
|
+
=begin
|
23
|
+
@companies_coll.find().each do |company|
|
24
|
+
company['languages'] = {}
|
25
|
+
@companies_coll.update({"_id" => company["_id"]}, company)
|
26
|
+
end
|
27
|
+
=end
|
28
|
+
|
21
29
|
companies = @companies_coll.find(
|
22
|
-
"languages" => {"$exists" => false}
|
23
|
-
|
30
|
+
# "languages" => {"$exists" => false}
|
31
|
+
# "languages" => {}
|
32
|
+
).to_a
|
24
33
|
|
25
34
|
languages = Hash.new
|
26
35
|
|
@@ -28,7 +37,7 @@ class TechAdTagger < Base
|
|
28
37
|
|
29
38
|
languages = Hash.new
|
30
39
|
|
31
|
-
company_languages_map = @search_client.
|
40
|
+
company_languages_map = @search_client.google_search tech_keywords, company["websiteUrl"]
|
32
41
|
|
33
42
|
company_languages_map.each do |key, value|
|
34
43
|
|
@@ -34,10 +34,10 @@ class DataPopulators
|
|
34
34
|
@@geo_tagger = GeoTagger.new @@log
|
35
35
|
@@gather_companies = GatherCompanies.new
|
36
36
|
@@companies_searcher = CompaniesSearcher.new @@geo_tagger
|
37
|
-
@@
|
37
|
+
@@tech_ad_tagger = TechAdTagger.new
|
38
38
|
|
39
39
|
#data holders
|
40
|
-
@@facet_location = "us:
|
40
|
+
@@facet_location = "us:84"
|
41
41
|
@@programming_languages = ["java", "ruby", "c#", "php", "python", "javascript"]
|
42
42
|
|
43
43
|
|
@@ -48,19 +48,34 @@ class DataPopulators
|
|
48
48
|
if __FILE__ == $PROGRAM_NAME
|
49
49
|
|
50
50
|
|
51
|
-
|
51
|
+
t1 = Thread.new do
|
52
52
|
|
53
|
-
|
53
|
+
begin
|
54
54
|
|
55
|
-
|
55
|
+
# @@gather_companies.load_companies_to_db 700, 0, @@facet_location
|
56
56
|
|
57
|
+
rescue Exception => e
|
58
|
+
puts e.message
|
59
|
+
puts e.backtrace
|
57
60
|
end
|
58
61
|
|
59
|
-
rescue Exception => e
|
60
|
-
puts e.message
|
61
|
-
puts e.backtrace
|
62
62
|
end
|
63
63
|
|
64
|
+
|
65
|
+
t2 = Thread.new do
|
66
|
+
|
67
|
+
begin
|
68
|
+
|
69
|
+
@@tech_ad_tagger.tag_company_with_technologies @@programming_languages
|
70
|
+
|
71
|
+
rescue Exception => e
|
72
|
+
puts e.message
|
73
|
+
puts e.backtrace
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
|
64
79
|
#this is necessary, for some reason, otherwise the process just gets killed, as the sub-threads are dependent on the main thread remaining alive
|
65
80
|
while true
|
66
81
|
sleep(5)
|
@@ -71,15 +86,15 @@ class DataPopulators
|
|
71
86
|
|
72
87
|
#examples:
|
73
88
|
|
74
|
-
#
|
89
|
+
# @@tech_ad_tagger.tag_company_with_technologies @@programming_languages
|
75
90
|
|
76
|
-
#
|
91
|
+
# @@gather_companies.load_companies_to_db 700, 0, @@facet_location
|
77
92
|
|
78
|
-
#
|
93
|
+
# @@geo_tagger.load_geolocations_into_db
|
79
94
|
|
80
|
-
#
|
95
|
+
# @@geo_tagger.update_companies_with_latitude_longitude
|
81
96
|
|
82
|
-
# near =
|
97
|
+
# near = @@companies_searcher.zip_code_search "95688"
|
83
98
|
|
84
99
|
# near = self_instance.companies_searcher.geospatial_search -122.4099154, 37.8059887
|
85
100
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whos_using_what
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -91,6 +91,22 @@ dependencies:
|
|
91
91
|
- - ! '>='
|
92
92
|
- !ruby/object:Gem::Version
|
93
93
|
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: mechanize
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
94
110
|
- !ruby/object:Gem::Dependency
|
95
111
|
name: mongo
|
96
112
|
requirement: !ruby/object:Gem::Requirement
|
@@ -107,6 +123,38 @@ dependencies:
|
|
107
123
|
- - ! '>='
|
108
124
|
- !ruby/object:Gem::Version
|
109
125
|
version: '0'
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: watir-webdriver
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
type: :runtime
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
- !ruby/object:Gem::Dependency
|
143
|
+
name: headless
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
146
|
+
requirements:
|
147
|
+
- - ! '>='
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
type: :runtime
|
151
|
+
prerelease: false
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ! '>='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
110
158
|
description: What companies are using what technologies
|
111
159
|
email: r.dane1010@gmail.com
|
112
160
|
executables: []
|
@@ -117,9 +165,9 @@ files:
|
|
117
165
|
- lib/whos_using_what/util/map_data_extraction_util.rb
|
118
166
|
- lib/whos_using_what/logging/logger_factory.rb
|
119
167
|
- lib/whos_using_what/scripts/data_populators.rb
|
168
|
+
- lib/whos_using_what/api_clients/google_client.rb
|
120
169
|
- lib/whos_using_what/api_clients/base_api_client.rb
|
121
170
|
- lib/whos_using_what/api_clients/linkedin_client.rb
|
122
|
-
- lib/whos_using_what/api_clients/whos_using_what_search_client.rb
|
123
171
|
- lib/whos_using_what/api_clients/google_locations_client.rb
|
124
172
|
- lib/whos_using_what/data_gatherers/gather_companies.rb
|
125
173
|
- lib/whos_using_what/data_gatherers/geo_tagger.rb
|
@@ -1,137 +0,0 @@
|
|
1
|
-
require_relative "../base"
|
2
|
-
|
3
|
-
class WhosUsingWhatSearchClient < Base
|
4
|
-
|
5
|
-
require "uri"
|
6
|
-
require "rest-client"
|
7
|
-
|
8
|
-
attr :results
|
9
|
-
|
10
|
-
def initialize()
|
11
|
-
|
12
|
-
|
13
|
-
@negativeMatchUrlPatterns = Array.new.push("google.com").push("youtube.com")
|
14
|
-
|
15
|
-
@positiveMatchUrlPatterns = Array.new.push("http")
|
16
|
-
|
17
|
-
@technologiesToSearchFor = Array.new.push("ruby").push("java").push("javascript").push("python").push("c++").push("c#")
|
18
|
-
|
19
|
-
@jobPageTokens = Array.new.push("job", "hiring", "career")
|
20
|
-
|
21
|
-
@results = Hash.new
|
22
|
-
|
23
|
-
end
|
24
|
-
|
25
|
-
private
|
26
|
-
|
27
|
-
def extractUrls (rawInput, mustContainUrl)
|
28
|
-
|
29
|
-
acceptedUrls = Array.new
|
30
|
-
|
31
|
-
if (rawInput == nil)
|
32
|
-
return acceptedUrls
|
33
|
-
end
|
34
|
-
|
35
|
-
urls = []
|
36
|
-
|
37
|
-
begin
|
38
|
-
urls = URI.extract(rawInput)
|
39
|
-
end
|
40
|
-
|
41
|
-
if urls.size < 1
|
42
|
-
return urls
|
43
|
-
end
|
44
|
-
|
45
|
-
urls.each do |url|
|
46
|
-
add = true
|
47
|
-
@negativeMatchUrlPatterns.each do |token|
|
48
|
-
|
49
|
-
if (nil != url.index(token))
|
50
|
-
add = false
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
@positiveMatchUrlPatterns.each do |token|
|
55
|
-
|
56
|
-
if (nil == url.index(token) || url.index(token) > 0)
|
57
|
-
add = false
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
if (mustContainUrl != nil && url.index(mustContainUrl) == nil)
|
62
|
-
add = false
|
63
|
-
end
|
64
|
-
|
65
|
-
if (add)
|
66
|
-
acceptedUrls.push(url)
|
67
|
-
end
|
68
|
-
end
|
69
|
-
acceptedUrls
|
70
|
-
end
|
71
|
-
|
72
|
-
|
73
|
-
def arraySearch(array, rawHtml)
|
74
|
-
|
75
|
-
rawHtml = rawHtml.downcase
|
76
|
-
array.each do |token|
|
77
|
-
if (rawHtml.index(token) != nil)
|
78
|
-
return true
|
79
|
-
end
|
80
|
-
end
|
81
|
-
return false
|
82
|
-
end
|
83
|
-
|
84
|
-
|
85
|
-
def determineIfUsesTechnology(technology, rawHtml)
|
86
|
-
|
87
|
-
isJobPage = arraySearch(@jobPageTokens, rawHtml)
|
88
|
-
|
89
|
-
return isJobPage
|
90
|
-
|
91
|
-
end
|
92
|
-
|
93
|
-
public
|
94
|
-
|
95
|
-
|
96
|
-
#performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
|
97
|
-
#If an ad exists it is returned as part of map
|
98
|
-
def search queries, url
|
99
|
-
|
100
|
-
begin
|
101
|
-
rawHtml = RestClient.get(url)
|
102
|
-
rescue
|
103
|
-
|
104
|
-
end
|
105
|
-
|
106
|
-
urls = extractUrls(rawHtml, url)
|
107
|
-
|
108
|
-
matching_url = nil
|
109
|
-
|
110
|
-
ret_map = Hash.new
|
111
|
-
|
112
|
-
urls.each do |cur_url|
|
113
|
-
begin
|
114
|
-
html = RestClient.get(cur_url)
|
115
|
-
|
116
|
-
queries.each do |query|
|
117
|
-
|
118
|
-
url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << cur_url << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
|
119
|
-
|
120
|
-
uses_technology = determineIfUsesTechnology(query, html)
|
121
|
-
|
122
|
-
if (uses_technology)
|
123
|
-
ret_map[query] = cur_url
|
124
|
-
end
|
125
|
-
|
126
|
-
end
|
127
|
-
|
128
|
-
rescue Exception => exception
|
129
|
-
#don't really care at this point, probably not worth logging as some sites just don't end up loading properly
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
ret_map
|
134
|
-
end
|
135
|
-
|
136
|
-
end
|
137
|
-
|