whos_using_what 0.3.1 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,6 +2,60 @@ require_relative "../base"
2
2
 
3
3
  class BaseApiClient < Base
4
4
 
5
+ require "uri"
6
+ require "rest-client"
7
+
8
# Reports whether at least one of the given tokens occurs in the text.
#
# The text is downcased before matching; the tokens are used as given, so
# string tokens must already be lowercase to be found.
#
# @param array [Array<String>, nil] tokens to look for
# @param rawHtml [String, nil] text (typically a page's HTML) to scan
# @return [Boolean] true when any token is found; false otherwise, including
#   when either argument is nil
def arraySearch(array, rawHtml)
  # A missing page or token list simply means "no match" rather than a crash.
  return false if array.nil? || rawHtml.nil?

  haystack = rawHtml.downcase
  array.any? { |token| !haystack.index(token).nil? }
end
18
+
19
# Joins the entries of an array into one string.
#
# Each entry is stripped of surrounding whitespace, the entries are joined
# with +delim+, and the joined result is stripped once more.
#
# @param array [Array<String>] entries to join (not modified)
# @param delim [String] separator placed between entries
# @return [String] the joined string; "" for an empty array
def arry_to_str_delim(array, delim)
  array.map(&:strip).join(delim).strip
end
35
+
36
+
37
# Normalizes a URL (or search-operator string) so URLs can be compared.
#
# Surrounding whitespace is removed, then the first occurrence of "www." and
# the first occurrence of "site:" are removed, wherever they appear in the
# string. The argument itself is left unmodified.
#
# @param url [String] the raw URL text
# @return [String] a new, cleaned-up string
def cleanup_url(url)
  url.strip.sub("www.", "").sub("site:", "")
end
49
+
50
# Decides whether a page looks like a job posting that mentions a technology.
#
# The page must contain one of @jobPageTokens (checked via arraySearch) AND
# mention +technology+ as a standalone term. Previously the technology was
# ignored, so any job-looking page was reported for every technology.
#
# @param technology [String, nil] technology name, e.g. "ruby"; when blank,
#   only the job-page check is performed
# @param rawHtml [String] page content to inspect
# @return [Boolean]
def determineIfUsesTechnology(technology, rawHtml)
  is_job_page = arraySearch(@jobPageTokens, rawHtml)

  tech = technology.to_s.strip.downcase
  return is_job_page if tech.empty?

  # Require non-alphanumeric boundaries so "java" does not match inside
  # "javascript", and "c" would not match "c++" / "c#". arraySearch downcases
  # the page before matching, so a lowercase pattern is sufficient.
  mention = /(?<![a-z0-9])#{Regexp.escape(tech)}(?![a-z0-9+#])/

  is_job_page && arraySearch([mention], rawHtml)
end
57
+
58
+
5
59
  def starts_with?(string, prefix)
6
60
  prefix = prefix.to_s
7
61
  string[0, prefix.length] == prefix
@@ -0,0 +1,177 @@
1
+ require_relative "base_api_client"
2
+ require 'mechanize'
3
+ require 'watir-webdriver'
4
+ require 'headless'
5
+
6
# Search-engine backed client that looks for job postings on a company's own
# website, one technology keyword at a time.
#
# Pages are loaded through a Watir-driven Firefox running inside a Headless
# virtual display. Call #close when done to release both.
# Relies on cleanup_url, starts_with? and determineIfUsesTechnology from
# BaseApiClient.
class GoogleClient < BaseApiClient

  # Range (in seconds) of the random pause taken after each search-engine query.
  THROTTLE_SECONDS = (1..5)

  attr_reader :results

  # Sets up the matching rules and starts the headless display and browser.
  def initialize
    # URLs containing any of these are discarded (search engines themselves).
    @negativeMatchUrlPatterns = ['google', 'youtube', 'duckduckgo', 'bing', 'yahoo']

    # URLs must start with one of these (or with the company's own URL).
    @positiveMatchUrlPatterns = ['http', 'www']

    @technologiesToSearchFor = ['ruby', 'java', 'javascript', 'python']

    @jobPageTokens = ['job', 'hiring', 'career']

    @results = Hash.new

    @mechanize = Mechanize.new

    # Keep a handle on the display so it can be torn down in #close.
    @headless = Headless.new
    @headless.start

    begin
      @browser = Watir::Browser.new :firefox
    rescue StandardError
      # Do not leave the virtual display running if the browser cannot start.
      @headless.destroy
      raise
    end
  end

  # Closes the browser and tears down the headless display started in
  # #initialize.
  def close
    @browser.close if @browser
    @headless.destroy if @headless
  end

  # Extracts the URLs found in +rawInput+ that belong to the given site.
  #
  # A URL is kept when, after cleanup_url, it starts with a positive pattern
  # (or with the site's URL), contains the site's URL, and contains none of
  # the negative patterns. Duplicates are removed.
  #
  # @param rawInput [String, nil] text (search result HTML) to scan
  # @param mustContainUrl [String, nil] the company's website URL
  # @return [Array<String>] accepted, cleaned-up URLs; empty when either
  #   argument is nil or nothing qualifies
  def extractUrls(rawInput, mustContainUrl)
    return [] if rawInput.nil? || mustContainUrl.nil?

    required = cleanup_url(mustContainUrl)

    # cleanup_url is applied exactly once per URL: it removes the first
    # "www."/"site:" anywhere in the string, so a second pass could damage
    # the path or query of an otherwise valid URL.
    candidates = URI.extract(rawInput).map { |url| cleanup_url(url) }

    candidates.select { |url| acceptable_url?(url, required) }.uniq
  end

  # Builds a Google advanced-search URL for "hiring <keyword>" restricted to
  # the given site.
  #
  # @param site_url [String] the company's website URL
  # @param search_keyword [String] technology keyword
  # @return [String]
  def generate_google_url(site_url, search_keyword)
    "http://www.google.com/search?hl=en" \
      "&as_q=hiring+#{escape_keyword(search_keyword)}" \
      "&as_sitesearch=#{cleanup_url(site_url)}"
  end

  # Builds a DuckDuckGo query URL of the form "site:<site> hiring <keyword>".
  #
  # @param site_url [String] the company's website URL
  # @param search_keyword [String] technology keyword
  # @return [String]
  def generate_duckduckgo_url(site_url, search_keyword)
    "http://duckduckgo.com/?q=site:#{cleanup_url(site_url)}+hiring+#{escape_keyword(search_keyword)}"
  end

  # Performs a search-engine search restricted to a company's website and
  # then attempts to determine whether it has job listings for a technology.
  #
  # Despite the name, the query is currently sent to DuckDuckGo. For every
  # entry of @technologiesToSearchFor the result page is scanned for URLs on
  # the company's site; each is loaded and checked with
  # determineIfUsesTechnology. When several pages match, the last one wins.
  #
  # @param queries [Array<String>] currently unused; the technologies come
  #   from @technologiesToSearchFor
  # @param site_url [String] the company's website URL
  # @return [Hash{String => String}] technology keyword => URL of a matching page
  def google_search(queries, site_url)
    ret_map = Hash.new

    @technologiesToSearchFor.each do |search_keyword|
      raw_html = fetch_search_results(site_url, search_keyword)

      extractUrls(raw_html, site_url).each do |cur_url|
        begin
          @browser.goto cur_url
          html = @browser.html

          ret_map[search_keyword] = cur_url if determineIfUsesTechnology(search_keyword, html)
        rescue StandardError => e
          # Some sites simply fail to load; report and move on to the next URL.
          puts e.message
        end
      end

      # Throttle after every query to avoid being black-listed by the search
      # engine. rand(1-5) used to evaluate to rand(-4), never the intended 1..5.
      sleep rand(THROTTLE_SECONDS)
    end

    ret_map
  end

  private

  # Loads the search-engine result page for one keyword; returns its HTML, or
  # "" when the query could not be performed.
  def fetch_search_results(site_url, search_keyword)
    url = ""
    begin
      url = generate_duckduckgo_url site_url, search_keyword

      @browser.goto url
      html = @browser.html

      puts "successfully queried url:#{url}"
      html
    rescue StandardError => e
      puts "exception:#{e.message} when querying url: #{url}"
      ""
    end
  end

  # Applies the positive / must-contain / negative URL rules to one cleaned URL.
  def acceptable_url?(url, required)
    has_known_prefix = starts_with?(url, required) ||
      @positiveMatchUrlPatterns.any? { |token| starts_with?(url, token) }

    has_known_prefix &&
      url.include?(required) &&
      @negativeMatchUrlPatterns.none? { |token| url.include?(token) }
  end

  # Percent-encodes a keyword so characters such as "#" or "+" (e.g. in "c#",
  # "c++") survive inside the query string.
  def escape_keyword(search_keyword)
    URI.encode_www_form_component(search_keyword.to_s.strip)
  end

end
177
+
@@ -1,5 +1,9 @@
1
1
  class Base
2
2
 
3
+ attr :set_paths
4
+
5
+ @@paths_set = false
6
+
3
7
  def self.set_paths
4
8
  $:.unshift(File.expand_path('../data_gatherers', __FILE__))
5
9
  $:.unshift(File.expand_path('../data_searchers', __FILE__))
@@ -8,9 +12,9 @@ class Base
8
12
  $:.unshift(File.expand_path('../util', __FILE__))
9
13
  $:.unshift(File.expand_path('../logging', __FILE__))
10
14
 
15
+ @@paths_set = true
11
16
  end
12
17
 
13
18
  set_paths
14
19
 
15
-
16
20
  end
@@ -2,11 +2,11 @@ require_relative "../base"
2
2
 
3
3
  class TechAdTagger < Base
4
4
 
5
- require 'whos_using_what_search_client'
5
+ def initialize
6
6
 
7
+ require_relative '../api_clients/google_client'
7
8
 
8
- def initialize
9
- @search_client = WhosUsingWhatSearchClient.new
9
+ @search_client = GoogleClient.new
10
10
 
11
11
  @mongo_client = MongoHelper.get_mongo_connection
12
12
  @companies_coll = @mongo_client['companies']
@@ -18,9 +18,18 @@ class TechAdTagger < Base
18
18
  #iterates through array and updates company db record with technologies found from ads from their website
19
19
  def tag_company_with_technologies tech_keywords
20
20
 
21
+ # uncomment if need to clear out all existing technologies
22
+ =begin
23
+ @companies_coll.find().each do |company|
24
+ company['languages'] = {}
25
+ @companies_coll.update({"_id" => company["_id"]}, company)
26
+ end
27
+ =end
28
+
21
29
  companies = @companies_coll.find(
22
- "languages" => {"$exists" => false}
23
- )
30
+ # "languages" => {"$exists" => false}
31
+ # "languages" => {}
32
+ ).to_a
24
33
 
25
34
  languages = Hash.new
26
35
 
@@ -28,7 +37,7 @@ class TechAdTagger < Base
28
37
 
29
38
  languages = Hash.new
30
39
 
31
- company_languages_map = @search_client.search tech_keywords, company["websiteUrl"]
40
+ company_languages_map = @search_client.google_search tech_keywords, company["websiteUrl"]
32
41
 
33
42
  company_languages_map.each do |key, value|
34
43
 
@@ -34,10 +34,10 @@ class DataPopulators
34
34
  @@geo_tagger = GeoTagger.new @@log
35
35
  @@gather_companies = GatherCompanies.new
36
36
  @@companies_searcher = CompaniesSearcher.new @@geo_tagger
37
- @@ech_ad_tagger = TechAdTagger.new
37
+ @@tech_ad_tagger = TechAdTagger.new
38
38
 
39
39
  #data holders
40
- @@facet_location = "us:82" #Sacramento
40
+ @@facet_location = "us:84"
41
41
  @@programming_languages = ["java", "ruby", "c#", "php", "python", "javascript"]
42
42
 
43
43
 
@@ -48,19 +48,34 @@ class DataPopulators
48
48
  if __FILE__ == $PROGRAM_NAME
49
49
 
50
50
 
51
- begin
51
+ t1 = Thread.new do
52
52
 
53
- t1 = Thread.new do
53
+ begin
54
54
 
55
- @@geo_tagger.load_geolocations_into_db
55
+ # @@gather_companies.load_companies_to_db 700, 0, @@facet_location
56
56
 
57
+ rescue Exception => e
58
+ puts e.message
59
+ puts e.backtrace
57
60
  end
58
61
 
59
- rescue Exception => e
60
- puts e.message
61
- puts e.backtrace
62
62
  end
63
63
 
64
+
65
+ t2 = Thread.new do
66
+
67
+ begin
68
+
69
+ @@tech_ad_tagger.tag_company_with_technologies @@programming_languages
70
+
71
+ rescue Exception => e
72
+ puts e.message
73
+ puts e.backtrace
74
+ end
75
+
76
+ end
77
+
78
+
64
79
  #this is necessary, for some reason, otherwise the process just gets killed, as the sub-threads are dependent on the main thread remaining alive
65
80
  while true
66
81
  sleep(5)
@@ -71,15 +86,15 @@ class DataPopulators
71
86
 
72
87
  #examples:
73
88
 
74
- # self_instance.tech_ad_tagger.tag_company_with_technologies self_instance.programming_languages
89
+ # @@tech_ad_tagger.tag_company_with_technologies @@programming_languages
75
90
 
76
- # self_instance.gather_companies.load_companies_to_db 700, 0, facet_location
91
+ # @@gather_companies.load_companies_to_db 700, 0, @@facet_location
77
92
 
78
- # self_instance.geo_tagger.load_geolocations_into_db
93
+ # @@geo_tagger.load_geolocations_into_db
79
94
 
80
- # self_instance.geo_tagger.update_companies_with_latitude_longitude
95
+ # @@geo_tagger.update_companies_with_latitude_longitude
81
96
 
82
- # near = self_instance.companies_searcher.zip_code_search "95688"
97
+ # near = @@companies_searcher.zip_code_search "95688"
83
98
 
84
99
  # near = self_instance.companies_searcher.geospatial_search -122.4099154, 37.8059887
85
100
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whos_using_what
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -91,6 +91,22 @@ dependencies:
91
91
  - - ! '>='
92
92
  - !ruby/object:Gem::Version
93
93
  version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: mechanize
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
94
110
  - !ruby/object:Gem::Dependency
95
111
  name: mongo
96
112
  requirement: !ruby/object:Gem::Requirement
@@ -107,6 +123,38 @@ dependencies:
107
123
  - - ! '>='
108
124
  - !ruby/object:Gem::Version
109
125
  version: '0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: watir-webdriver
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :runtime
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
142
+ - !ruby/object:Gem::Dependency
143
+ name: headless
144
+ requirement: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ! '>='
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ type: :runtime
151
+ prerelease: false
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ! '>='
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
110
158
  description: What companies are using what technologies
111
159
  email: r.dane1010@gmail.com
112
160
  executables: []
@@ -117,9 +165,9 @@ files:
117
165
  - lib/whos_using_what/util/map_data_extraction_util.rb
118
166
  - lib/whos_using_what/logging/logger_factory.rb
119
167
  - lib/whos_using_what/scripts/data_populators.rb
168
+ - lib/whos_using_what/api_clients/google_client.rb
120
169
  - lib/whos_using_what/api_clients/base_api_client.rb
121
170
  - lib/whos_using_what/api_clients/linkedin_client.rb
122
- - lib/whos_using_what/api_clients/whos_using_what_search_client.rb
123
171
  - lib/whos_using_what/api_clients/google_locations_client.rb
124
172
  - lib/whos_using_what/data_gatherers/gather_companies.rb
125
173
  - lib/whos_using_what/data_gatherers/geo_tagger.rb
@@ -1,137 +0,0 @@
1
- require_relative "../base"
2
-
3
- class WhosUsingWhatSearchClient < Base
4
-
5
- require "uri"
6
- require "rest-client"
7
-
8
- attr :results
9
-
10
- def initialize()
11
-
12
-
13
- @negativeMatchUrlPatterns = Array.new.push("google.com").push("youtube.com")
14
-
15
- @positiveMatchUrlPatterns = Array.new.push("http")
16
-
17
- @technologiesToSearchFor = Array.new.push("ruby").push("java").push("javascript").push("python").push("c++").push("c#")
18
-
19
- @jobPageTokens = Array.new.push("job", "hiring", "career")
20
-
21
- @results = Hash.new
22
-
23
- end
24
-
25
- private
26
-
27
- def extractUrls (rawInput, mustContainUrl)
28
-
29
- acceptedUrls = Array.new
30
-
31
- if (rawInput == nil)
32
- return acceptedUrls
33
- end
34
-
35
- urls = []
36
-
37
- begin
38
- urls = URI.extract(rawInput)
39
- end
40
-
41
- if urls.size < 1
42
- return urls
43
- end
44
-
45
- urls.each do |url|
46
- add = true
47
- @negativeMatchUrlPatterns.each do |token|
48
-
49
- if (nil != url.index(token))
50
- add = false
51
- end
52
- end
53
-
54
- @positiveMatchUrlPatterns.each do |token|
55
-
56
- if (nil == url.index(token) || url.index(token) > 0)
57
- add = false
58
- end
59
- end
60
-
61
- if (mustContainUrl != nil && url.index(mustContainUrl) == nil)
62
- add = false
63
- end
64
-
65
- if (add)
66
- acceptedUrls.push(url)
67
- end
68
- end
69
- acceptedUrls
70
- end
71
-
72
-
73
- def arraySearch(array, rawHtml)
74
-
75
- rawHtml = rawHtml.downcase
76
- array.each do |token|
77
- if (rawHtml.index(token) != nil)
78
- return true
79
- end
80
- end
81
- return false
82
- end
83
-
84
-
85
- def determineIfUsesTechnology(technology, rawHtml)
86
-
87
- isJobPage = arraySearch(@jobPageTokens, rawHtml)
88
-
89
- return isJobPage
90
-
91
- end
92
-
93
- public
94
-
95
-
96
- #performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
97
- #If an ad exists it is returned as part of map
98
- def search queries, url
99
-
100
- begin
101
- rawHtml = RestClient.get(url)
102
- rescue
103
-
104
- end
105
-
106
- urls = extractUrls(rawHtml, url)
107
-
108
- matching_url = nil
109
-
110
- ret_map = Hash.new
111
-
112
- urls.each do |cur_url|
113
- begin
114
- html = RestClient.get(cur_url)
115
-
116
- queries.each do |query|
117
-
118
- url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << cur_url << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
119
-
120
- uses_technology = determineIfUsesTechnology(query, html)
121
-
122
- if (uses_technology)
123
- ret_map[query] = cur_url
124
- end
125
-
126
- end
127
-
128
- rescue Exception => exception
129
- #don't really care at this point, probably not worth logging as some sites just don't end up loading properly
130
- end
131
- end
132
-
133
- ret_map
134
- end
135
-
136
- end
137
-