whos_using_what 0.3.1 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/whos_using_what/api_clients/base_api_client.rb +54 -0
- data/lib/whos_using_what/api_clients/google_client.rb +177 -0
- data/lib/whos_using_what/base.rb +5 -1
- data/lib/whos_using_what/data_gatherers/tech_ad_tagger.rb +15 -6
- data/lib/whos_using_what/scripts/data_populators.rb +28 -13
- metadata +50 -2
- data/lib/whos_using_what/api_clients/whos_using_what_search_client.rb +0 -137
@@ -2,6 +2,60 @@ require_relative "../base"
|
|
2
2
|
|
3
3
|
class BaseApiClient < Base
|
4
4
|
|
5
|
+
require "uri"
|
6
|
+
require "rest-client"
|
7
|
+
|
8
|
+
# Returns true if any token in +array+ occurs as a substring of +rawHtml+.
# The HTML is downcased before matching, so tokens are expected lowercase.
#
# @param array [Array<String>] lowercase tokens to search for
# @param rawHtml [String] raw page HTML to scan
# @return [Boolean] true when at least one token is found
def arraySearch(array, rawHtml)
  haystack = rawHtml.downcase
  array.any? { |token| haystack.include?(token) }
end
|
18
|
+
|
19
|
+
# Joins the entries of +array+ with +delim+, stripping surrounding
# whitespace from each entry and from the final result.
#
# @param array [Array<String>] entries to join
# @param delim [String] separator placed between entries
# @return [String] delimited, stripped string ("" for an empty array)
def arry_to_str_delim array, delim
  # Equivalent to the original manual index-tracking loop: every entry is
  # stripped, entries are joined by delim, and the result is stripped once more.
  array.map(&:strip).join(delim).strip
end
|
35
|
+
|
36
|
+
|
37
|
+
# Normalizes a URL fragment for matching: trims surrounding whitespace and
# removes the first occurrence of "www." and of "site:", if present.
#
# Non-mutating: unlike the original `url["www."] = ""` form, `sub` never
# modifies its receiver, so the caller's string is untouched.
#
# @param url [String] raw URL text
# @return [String] the cleaned URL
def cleanup_url url
  url.strip.sub("www.", "").sub("site:", "")
end
|
49
|
+
|
50
|
+
# Decides whether the given page looks like a job posting.
#
# NOTE(review): the +technology+ argument is currently unused — the method
# only checks whether the page contains any job-page token (@jobPageTokens);
# confirm whether keyword matching was also intended here.
#
# @param technology [String] technology keyword (presently ignored)
# @param rawHtml [String] page HTML to inspect
# @return [Boolean] true when any job-page token occurs in the HTML
def determineIfUsesTechnology(technology, rawHtml)
  arraySearch(@jobPageTokens, rawHtml)
end
|
57
|
+
|
58
|
+
|
5
59
|
def starts_with?(string, prefix)
|
6
60
|
prefix = prefix.to_s
|
7
61
|
string[0, prefix.length] == prefix
|
@@ -0,0 +1,177 @@
|
|
1
|
+
require_relative "base_api_client"
|
2
|
+
require 'mechanize'
|
3
|
+
require 'watir-webdriver'
|
4
|
+
require 'headless'
|
5
|
+
|
6
|
+
# Search-engine-backed client that visits a company's website (via a headless
# Firefox/Watir browser) and tries to detect job ads mentioning specific
# technologies.
class GoogleClient < BaseApiClient

  # Map built by callers from google_search results (technology => ad URL).
  attr_reader :results

  def initialize
    # Result URLs containing any of these substrings are search-engine
    # pages themselves and are discarded.
    @negativeMatchUrlPatterns = ['google', 'youtube', 'duckduckgo', 'bing', 'yahoo']
    # A candidate URL must start with one of these prefixes (or with the
    # company's own site URL) to be considered.
    @positiveMatchUrlPatterns = ['http', 'www']
    @technologiesToSearchFor = ['ruby', 'java', 'javascript', 'python']
    # Tokens whose presence marks a page as a job/careers page.
    @jobPageTokens = ['job', 'hiring', 'career']
    @results = Hash.new
    @mechanize = Mechanize.new

    # Run the real browser inside a headless X display.
    headless = Headless.new
    headless.start
    @browser = Watir::Browser.new :firefox
  end

  # Extracts candidate job-page URLs from raw search-result HTML.
  #
  # A URL is accepted when it starts with a positive pattern (or with the
  # company URL), contains the company URL, and matches no negative pattern.
  #
  # @param rawInput [String, nil] raw HTML of a search-result page
  # @param mustContainUrl [String] the company site URL every result must contain
  # @return [Array<String>] unique, cleaned, accepted URLs (possibly empty)
  def extractUrls (rawInput, mustContainUrl)
    acceptedUrls = []
    return acceptedUrls if rawInput.nil?

    urls = []
    begin
      urls = URI.extract(rawInput)
    rescue StandardError
      # Bug fix: the original bare begin/end had no rescue clause; malformed
      # input is now treated as containing no URLs instead of raising.
    end
    return acceptedUrls if urls.empty?

    mustContainUrl = cleanup_url mustContainUrl

    urls.each do |url|
      # Cleaned exactly once (the original cleaned twice, which could strip a
      # second "www."/"site:" occurrence from pathological URLs).
      url = cleanup_url url

      accepted = @positiveMatchUrlPatterns.any? do |token|
        (starts_with? url, token) || (starts_with? url, mustContainUrl)
      end
      next unless accepted
      next unless url.include? mustContainUrl
      next if @negativeMatchUrlPatterns.any? { |token| url.include? token }

      acceptedUrls.push url unless acceptedUrls.include? url
    end

    acceptedUrls
  end

  # Builds a Google advanced-search URL restricted to the given site.
  #
  # @param site_url [String] site to restrict the search to
  # @param search_keyword [String] technology keyword
  # @return [String] the assembled query URL
  def generate_google_url site_url, search_keyword
    parts = [
      "http://www.google.com/search?",
      "hl=en",
      "&as_q=hiring+#{search_keyword}",
      "&as_sitesearch=#{cleanup_url(site_url)}"
    ]
    arry_to_str_delim parts, ""
  end

  # Builds a DuckDuckGo query URL using the "site:" operator.
  #
  # @param site_url [String] site to restrict the search to
  # @param search_keyword [String] technology keyword
  # @return [String] the assembled query URL
  def generate_duckduckgo_url site_url, search_keyword
    parts = [
      "http://duckduckgo.com/?",
      "q=site:#{cleanup_url(site_url)}+hiring+#{search_keyword}"
    ]
    arry_to_str_delim parts, ""
  end

  # Performs a search-engine search restricted to a company's website and
  # attempts to determine if they have job listings for each technology.
  # If an ad exists it is returned as part of the map.
  #
  # NOTE(review): +queries+ is currently unused — the method iterates
  # @technologiesToSearchFor instead; confirm intended behavior.
  #
  # @param queries [Array<String>] (presently ignored)
  # @param site_url [String] the company's website URL
  # @return [Hash{String => String}] technology keyword => URL of matching ad
  def google_search queries, site_url
    ret_map = Hash.new

    @technologiesToSearchFor.each do |search_keyword|
      url = ""
      raw_html = ""

      begin
        url = generate_duckduckgo_url site_url, search_keyword

        # Perform the initial search-engine search.
        @browser.goto url
        raw_html = @browser.html

        puts "successfully queried url:" << url
      rescue StandardError => e
        # Bug fix: was `rescue Exception`, which also swallowed SignalException
        # and SystemExit; network/browser failures are StandardError.
        puts "exception:" << e.message << " when querying url: " << url
      end

      urls = extractUrls(raw_html, site_url)

      urls.each do |cur_url|
        begin
          @browser.goto cur_url
          html = @browser.html

          # Stripping all HTML tags for readability was disabled upstream:
          # TODO this was causing an exception
          # html = html.gsub!(/(<[^>]*>)|\n|\t/s) { " " }

          uses_technology = determineIfUsesTechnology(search_keyword, html)
          ret_map[search_keyword] = cur_url if uses_technology
        rescue StandardError => e
          # Some sites simply fail to load; log and continue.
          puts e.message
        end
      end
    end

    # Throttle queries to avoid being black-listed by the search engine.
    # Bug fix: the original `rand(1-5)` evaluates to rand(-4) (i.e. 0..3s);
    # an explicit Range yields the intended 1-5 seconds.
    sleep rand(1..5)

    ret_map
  end

end
|
177
|
+
|
data/lib/whos_using_what/base.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
class Base
|
2
2
|
|
3
|
+
attr :set_paths
|
4
|
+
|
5
|
+
@@paths_set = false
|
6
|
+
|
3
7
|
def self.set_paths
|
4
8
|
$:.unshift(File.expand_path('../data_gatherers', __FILE__))
|
5
9
|
$:.unshift(File.expand_path('../data_searchers', __FILE__))
|
@@ -8,9 +12,9 @@ class Base
|
|
8
12
|
$:.unshift(File.expand_path('../util', __FILE__))
|
9
13
|
$:.unshift(File.expand_path('../logging', __FILE__))
|
10
14
|
|
15
|
+
@@paths_set = true
|
11
16
|
end
|
12
17
|
|
13
18
|
set_paths
|
14
19
|
|
15
|
-
|
16
20
|
end
|
@@ -2,11 +2,11 @@ require_relative "../base"
|
|
2
2
|
|
3
3
|
class TechAdTagger < Base
|
4
4
|
|
5
|
-
|
5
|
+
def initialize
|
6
6
|
|
7
|
+
require_relative '../api_clients/google_client'
|
7
8
|
|
8
|
-
|
9
|
-
@search_client = WhosUsingWhatSearchClient.new
|
9
|
+
@search_client = GoogleClient.new
|
10
10
|
|
11
11
|
@mongo_client = MongoHelper.get_mongo_connection
|
12
12
|
@companies_coll = @mongo_client['companies']
|
@@ -18,9 +18,18 @@ class TechAdTagger < Base
|
|
18
18
|
#iterates through array and updates company db record with technologies found from ads from their website
|
19
19
|
def tag_company_with_technologies tech_keywords
|
20
20
|
|
21
|
+
# uncomment if need to clear out all existing technologies
|
22
|
+
=begin
|
23
|
+
@companies_coll.find().each do |company|
|
24
|
+
company['languages'] = {}
|
25
|
+
@companies_coll.update({"_id" => company["_id"]}, company)
|
26
|
+
end
|
27
|
+
=end
|
28
|
+
|
21
29
|
companies = @companies_coll.find(
|
22
|
-
"languages" => {"$exists" => false}
|
23
|
-
|
30
|
+
# "languages" => {"$exists" => false}
|
31
|
+
# "languages" => {}
|
32
|
+
).to_a
|
24
33
|
|
25
34
|
languages = Hash.new
|
26
35
|
|
@@ -28,7 +37,7 @@ class TechAdTagger < Base
|
|
28
37
|
|
29
38
|
languages = Hash.new
|
30
39
|
|
31
|
-
company_languages_map = @search_client.
|
40
|
+
company_languages_map = @search_client.google_search tech_keywords, company["websiteUrl"]
|
32
41
|
|
33
42
|
company_languages_map.each do |key, value|
|
34
43
|
|
@@ -34,10 +34,10 @@ class DataPopulators
|
|
34
34
|
@@geo_tagger = GeoTagger.new @@log
|
35
35
|
@@gather_companies = GatherCompanies.new
|
36
36
|
@@companies_searcher = CompaniesSearcher.new @@geo_tagger
|
37
|
-
@@
|
37
|
+
@@tech_ad_tagger = TechAdTagger.new
|
38
38
|
|
39
39
|
#data holders
|
40
|
-
@@facet_location = "us:
|
40
|
+
@@facet_location = "us:84"
|
41
41
|
@@programming_languages = ["java", "ruby", "c#", "php", "python", "javascript"]
|
42
42
|
|
43
43
|
|
@@ -48,19 +48,34 @@ class DataPopulators
|
|
48
48
|
if __FILE__ == $PROGRAM_NAME
|
49
49
|
|
50
50
|
|
51
|
-
|
51
|
+
t1 = Thread.new do
|
52
52
|
|
53
|
-
|
53
|
+
begin
|
54
54
|
|
55
|
-
|
55
|
+
# @@gather_companies.load_companies_to_db 700, 0, @@facet_location
|
56
56
|
|
57
|
+
rescue Exception => e
|
58
|
+
puts e.message
|
59
|
+
puts e.backtrace
|
57
60
|
end
|
58
61
|
|
59
|
-
rescue Exception => e
|
60
|
-
puts e.message
|
61
|
-
puts e.backtrace
|
62
62
|
end
|
63
63
|
|
64
|
+
|
65
|
+
t2 = Thread.new do
|
66
|
+
|
67
|
+
begin
|
68
|
+
|
69
|
+
@@tech_ad_tagger.tag_company_with_technologies @@programming_languages
|
70
|
+
|
71
|
+
rescue Exception => e
|
72
|
+
puts e.message
|
73
|
+
puts e.backtrace
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
|
64
79
|
#this is necessary, for some reason, otherwise the process just gets killed, as the sub-threads are dependent on the main thread remaining alive
|
65
80
|
while true
|
66
81
|
sleep(5)
|
@@ -71,15 +86,15 @@ class DataPopulators
|
|
71
86
|
|
72
87
|
#examples:
|
73
88
|
|
74
|
-
#
|
89
|
+
# @@tech_ad_tagger.tag_company_with_technologies @@programming_languages
|
75
90
|
|
76
|
-
#
|
91
|
+
# @@gather_companies.load_companies_to_db 700, 0, @@facet_location
|
77
92
|
|
78
|
-
#
|
93
|
+
# @@geo_tagger.load_geolocations_into_db
|
79
94
|
|
80
|
-
#
|
95
|
+
# @@geo_tagger.update_companies_with_latitude_longitude
|
81
96
|
|
82
|
-
# near =
|
97
|
+
# near = @@companies_searcher.zip_code_search "95688"
|
83
98
|
|
84
99
|
# near = self_instance.companies_searcher.geospatial_search -122.4099154, 37.8059887
|
85
100
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whos_using_what
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -91,6 +91,22 @@ dependencies:
|
|
91
91
|
- - ! '>='
|
92
92
|
- !ruby/object:Gem::Version
|
93
93
|
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: mechanize
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
94
110
|
- !ruby/object:Gem::Dependency
|
95
111
|
name: mongo
|
96
112
|
requirement: !ruby/object:Gem::Requirement
|
@@ -107,6 +123,38 @@ dependencies:
|
|
107
123
|
- - ! '>='
|
108
124
|
- !ruby/object:Gem::Version
|
109
125
|
version: '0'
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: watir-webdriver
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
type: :runtime
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
- !ruby/object:Gem::Dependency
|
143
|
+
name: headless
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
146
|
+
requirements:
|
147
|
+
- - ! '>='
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
type: :runtime
|
151
|
+
prerelease: false
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ! '>='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
110
158
|
description: What companies are using what technologies
|
111
159
|
email: r.dane1010@gmail.com
|
112
160
|
executables: []
|
@@ -117,9 +165,9 @@ files:
|
|
117
165
|
- lib/whos_using_what/util/map_data_extraction_util.rb
|
118
166
|
- lib/whos_using_what/logging/logger_factory.rb
|
119
167
|
- lib/whos_using_what/scripts/data_populators.rb
|
168
|
+
- lib/whos_using_what/api_clients/google_client.rb
|
120
169
|
- lib/whos_using_what/api_clients/base_api_client.rb
|
121
170
|
- lib/whos_using_what/api_clients/linkedin_client.rb
|
122
|
-
- lib/whos_using_what/api_clients/whos_using_what_search_client.rb
|
123
171
|
- lib/whos_using_what/api_clients/google_locations_client.rb
|
124
172
|
- lib/whos_using_what/data_gatherers/gather_companies.rb
|
125
173
|
- lib/whos_using_what/data_gatherers/geo_tagger.rb
|
@@ -1,137 +0,0 @@
|
|
1
|
-
require_relative "../base"
|
2
|
-
|
3
|
-
class WhosUsingWhatSearchClient < Base
|
4
|
-
|
5
|
-
require "uri"
|
6
|
-
require "rest-client"
|
7
|
-
|
8
|
-
attr :results
|
9
|
-
|
10
|
-
def initialize()
|
11
|
-
|
12
|
-
|
13
|
-
@negativeMatchUrlPatterns = Array.new.push("google.com").push("youtube.com")
|
14
|
-
|
15
|
-
@positiveMatchUrlPatterns = Array.new.push("http")
|
16
|
-
|
17
|
-
@technologiesToSearchFor = Array.new.push("ruby").push("java").push("javascript").push("python").push("c++").push("c#")
|
18
|
-
|
19
|
-
@jobPageTokens = Array.new.push("job", "hiring", "career")
|
20
|
-
|
21
|
-
@results = Hash.new
|
22
|
-
|
23
|
-
end
|
24
|
-
|
25
|
-
private
|
26
|
-
|
27
|
-
def extractUrls (rawInput, mustContainUrl)
|
28
|
-
|
29
|
-
acceptedUrls = Array.new
|
30
|
-
|
31
|
-
if (rawInput == nil)
|
32
|
-
return acceptedUrls
|
33
|
-
end
|
34
|
-
|
35
|
-
urls = []
|
36
|
-
|
37
|
-
begin
|
38
|
-
urls = URI.extract(rawInput)
|
39
|
-
end
|
40
|
-
|
41
|
-
if urls.size < 1
|
42
|
-
return urls
|
43
|
-
end
|
44
|
-
|
45
|
-
urls.each do |url|
|
46
|
-
add = true
|
47
|
-
@negativeMatchUrlPatterns.each do |token|
|
48
|
-
|
49
|
-
if (nil != url.index(token))
|
50
|
-
add = false
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
@positiveMatchUrlPatterns.each do |token|
|
55
|
-
|
56
|
-
if (nil == url.index(token) || url.index(token) > 0)
|
57
|
-
add = false
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
if (mustContainUrl != nil && url.index(mustContainUrl) == nil)
|
62
|
-
add = false
|
63
|
-
end
|
64
|
-
|
65
|
-
if (add)
|
66
|
-
acceptedUrls.push(url)
|
67
|
-
end
|
68
|
-
end
|
69
|
-
acceptedUrls
|
70
|
-
end
|
71
|
-
|
72
|
-
|
73
|
-
def arraySearch(array, rawHtml)
|
74
|
-
|
75
|
-
rawHtml = rawHtml.downcase
|
76
|
-
array.each do |token|
|
77
|
-
if (rawHtml.index(token) != nil)
|
78
|
-
return true
|
79
|
-
end
|
80
|
-
end
|
81
|
-
return false
|
82
|
-
end
|
83
|
-
|
84
|
-
|
85
|
-
def determineIfUsesTechnology(technology, rawHtml)
|
86
|
-
|
87
|
-
isJobPage = arraySearch(@jobPageTokens, rawHtml)
|
88
|
-
|
89
|
-
return isJobPage
|
90
|
-
|
91
|
-
end
|
92
|
-
|
93
|
-
public
|
94
|
-
|
95
|
-
|
96
|
-
#performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
|
97
|
-
#If an ad exists it is returned as part of map
|
98
|
-
def search queries, url
|
99
|
-
|
100
|
-
begin
|
101
|
-
rawHtml = RestClient.get(url)
|
102
|
-
rescue
|
103
|
-
|
104
|
-
end
|
105
|
-
|
106
|
-
urls = extractUrls(rawHtml, url)
|
107
|
-
|
108
|
-
matching_url = nil
|
109
|
-
|
110
|
-
ret_map = Hash.new
|
111
|
-
|
112
|
-
urls.each do |cur_url|
|
113
|
-
begin
|
114
|
-
html = RestClient.get(cur_url)
|
115
|
-
|
116
|
-
queries.each do |query|
|
117
|
-
|
118
|
-
url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << cur_url << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
|
119
|
-
|
120
|
-
uses_technology = determineIfUsesTechnology(query, html)
|
121
|
-
|
122
|
-
if (uses_technology)
|
123
|
-
ret_map[query] = cur_url
|
124
|
-
end
|
125
|
-
|
126
|
-
end
|
127
|
-
|
128
|
-
rescue Exception => exception
|
129
|
-
#don't really care at this point, probably not worth logging as some sites just don't end up loading properly
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
ret_map
|
134
|
-
end
|
135
|
-
|
136
|
-
end
|
137
|
-
|