whos_using_what 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,60 @@ require_relative "../base"
2
2
 
3
3
  class BaseApiClient < Base
4
4
 
5
+ require "uri"
6
+ require "rest-client"
7
+
8
# Reports whether any of the given tokens occurs in the text, ignoring case.
#
# @param array [Array<String>] tokens to look for
# @param rawHtml [String, nil] text to scan (callers pass a page's HTML)
# @return [Boolean] true as soon as one token is found, false otherwise
def arraySearch(array, rawHtml)
  # No text at all means nothing can match.
  return false if rawHtml.nil?

  haystack = rawHtml.downcase
  array.any? do |token|
    # The text is down-cased, so the token has to be as well; otherwise a
    # token such as "Hiring" could never match. Non-string tokens (anything
    # String#index accepts) are passed through untouched.
    needle = token.respond_to?(:downcase) ? token.downcase : token
    !haystack.index(needle).nil?
  end
end
18
+
19
# Joins the entries of an array into one string.
#
# Every entry is stripped of surrounding whitespace before joining, and the
# joined result is stripped once more (which matters when the delimiter itself
# is whitespace and the outermost entries are blank).
#
# @param array [Array<String>] pieces to join; not modified
# @param delim [String] text placed between consecutive pieces
# @return [String] the joined string, "" for an empty array
def arry_to_str_delim(array, delim)
  array.map(&:strip).join(delim).strip
end
35
+
36
+
37
# Normalises a URL (or search-engine "site:" expression) for comparison.
#
# Surrounding whitespace is removed, then the first occurrence of "www." and
# the first occurrence of "site:" are dropped, in that order. The argument
# itself is left untouched; a new string is returned.
#
# @param url [String] raw URL text
# @return [String] the cleaned-up copy
def cleanup_url(url)
  url.strip.sub('www.', '').sub('site:', '')
end
49
+
50
# Decides whether a page looks like a job ad for the given technology.
#
# The page must contain at least one of the job-page tokens held in
# @jobPageTokens AND mention the technology keyword (case-insensitive
# substring match, so "java" also matches "javascript").
#
# @param technology [String] technology keyword, e.g. "ruby"
# @param rawHtml [String, nil] HTML of the page under inspection
# @return [Boolean] true when both conditions hold
def determineIfUsesTechnology(technology, rawHtml)
  return false if rawHtml.nil?

  # @jobPageTokens is not assigned anywhere in this class; Array() keeps the
  # check from raising when it has not been set.
  return false unless arraySearch(Array(@jobPageTokens), rawHtml)

  keyword = technology.to_s.strip.downcase
  # With no keyword to look for, the job-page check alone decides.
  keyword.empty? || rawHtml.downcase.include?(keyword)
end
57
+
58
+
5
59
  def starts_with?(string, prefix)
6
60
  prefix = prefix.to_s
7
61
  string[0, prefix.length] == prefix
@@ -0,0 +1,177 @@
1
+ require_relative "base_api_client"
2
+ require 'mechanize'
3
+ require 'watir-webdriver'
4
+ require 'headless'
5
+
6
# Looks for technology-specific job ads on a company's website: it runs
# site-restricted search-engine queries through a Watir-driven Firefox started
# inside a Headless display session, then visits and inspects each result page.
class GoogleClient < BaseApiClient

  attr :results

  # Inclusive range of seconds to pause after each search-engine query.
  THROTTLE_SECONDS = (1..5)

  # Sets up the URL filters, keyword lists and the browser session.
  # Call #close when done to release the browser and the display.
  def initialize
    # URLs containing any of these are search-engine chrome, not company pages.
    @negativeMatchUrlPatterns = ['google', 'youtube', 'duckduckgo', 'bing', 'yahoo']

    # A candidate URL has to start with one of these (or with the company site).
    @positiveMatchUrlPatterns = ['http', 'www']

    @technologiesToSearchFor = ['ruby', 'java', 'javascript', 'python']

    @jobPageTokens = ['job', 'hiring', 'career']

    @results = Hash.new

    @mechanize = Mechanize.new

    @headless = Headless.new
    @headless.start
    begin
      @browser = Watir::Browser.new :firefox
    rescue StandardError
      # Do not leave the display session behind when the browser cannot start.
      @headless.destroy
      raise
    end
  end

  # Closes the browser and tears down the display session started by #initialize.
  def close
    @browser.close if @browser
    @headless.destroy if @headless
  end

  # Extracts the URLs from a chunk of text that appear to belong to a site.
  #
  # A URL is kept when, after clean-up, it starts with a positive pattern or
  # with the site itself, contains the site, and contains none of the negative
  # patterns. Duplicates are dropped; first-seen order is preserved.
  #
  # @param rawInput [String, nil] text (HTML) to scan
  # @param mustContainUrl [String] the company's site URL
  # @return [Array<String>] accepted, cleaned-up URLs (empty when none)
  def extractUrls(rawInput, mustContainUrl)
    acceptedUrls = []
    return acceptedUrls if rawInput.nil?

    urls = URI.extract(rawInput)
    return acceptedUrls if urls.empty?

    mustContainUrl = cleanup_url(mustContainUrl)

    urls.each do |url|
      url = cleanup_url(url)

      looks_like_link = starts_with?(url, mustContainUrl) ||
                        @positiveMatchUrlPatterns.any? { |token| starts_with?(url, token) }
      next unless looks_like_link
      next unless url.include?(mustContainUrl)
      next if @negativeMatchUrlPatterns.any? { |token| url.include?(token) }

      # Second pass removes a further "www."/"site:" occurrence if one remains.
      url = cleanup_url(url)

      acceptedUrls.push(url) unless acceptedUrls.include?(url)
    end

    acceptedUrls
  end

  # Builds a Google advanced-search URL restricted to one site.
  #
  # @param site_url [String] the company's site URL
  # @param search_keyword [String] technology keyword; URL-encoded so that
  #   characters such as "#" or "+" survive in the query string
  # @return [String] the query URL
  def generate_google_url(site_url, search_keyword)
    keyword = URI.encode_www_form_component(search_keyword.strip)

    "http://www.google.com/search?hl=en" \
      "&as_q=hiring+#{keyword}" \
      "&as_sitesearch=#{cleanup_url(site_url)}"
  end

  # Builds a DuckDuckGo query URL restricted to one site.
  #
  # @param site_url [String] the company's site URL
  # @param search_keyword [String] technology keyword (URL-encoded here)
  # @return [String] the query URL
  def generate_duckduckgo_url(site_url, search_keyword)
    keyword = URI.encode_www_form_component(search_keyword.strip)

    "http://duckduckgo.com/?q=site:#{cleanup_url(site_url)}+hiring+#{keyword}"
  end

  # Performs a search-engine search restricted to a company's website and then
  # attempts to determine whether it has job listings for a given technology.
  # If an ad exists, its URL is returned as part of the map.
  #
  # @param queries [Array<String>] currently ignored; the client's own
  #   @technologiesToSearchFor list is searched instead.
  #   NOTE(review): confirm whether callers expect their list to be honoured.
  # @param site_url [String] the company's website URL
  # @return [Hash{String => String}] technology keyword => URL of a matching page
  def google_search(queries, site_url)
    ret_map = {}

    @technologiesToSearchFor.each do |search_keyword|
      url = ""
      raw_html = ""

      begin
        url = generate_duckduckgo_url(site_url, search_keyword)

        # perform initial search engine search
        @browser.goto url
        raw_html = @browser.html

        puts "successfully queried url:#{url}"
      rescue StandardError => e
        # Best effort: a failed query leaves raw_html empty, so no result
        # pages are visited for this keyword.
        puts "exception:#{e.message} when querying url: #{url}"
      end

      extractUrls(raw_html, site_url).each do |cur_url|
        begin
          @browser.goto cur_url
          html = @browser.html

          # TODO: stripping the HTML tags here was causing an exception:
          # html = html.gsub!(/(<[^>]*>)|\n|\t/s) { " " }

          ret_map[search_keyword] = cur_url if determineIfUsesTechnology(search_keyword, html)
        rescue StandardError => e
          # Some sites simply fail to load; report and move on.
          puts e.message
        end
      end

      # Throttle after every query to avoid being black-listed by the search
      # engine. rand needs a Range here: rand(1-5) is rand(-4), i.e. 0..3.
      sleep rand(THROTTLE_SECONDS)
    end

    ret_map
  end

end
177
+
@@ -1,5 +1,9 @@
1
1
  class Base
2
2
 
3
+ attr :set_paths
4
+
5
+ @@paths_set = false
6
+
3
7
  def self.set_paths
4
8
  $:.unshift(File.expand_path('../data_gatherers', __FILE__))
5
9
  $:.unshift(File.expand_path('../data_searchers', __FILE__))
@@ -8,9 +12,9 @@ class Base
8
12
  $:.unshift(File.expand_path('../util', __FILE__))
9
13
  $:.unshift(File.expand_path('../logging', __FILE__))
10
14
 
15
+ @@paths_set = true
11
16
  end
12
17
 
13
18
  set_paths
14
19
 
15
-
16
20
  end
@@ -2,11 +2,11 @@ require_relative "../base"
2
2
 
3
3
  class TechAdTagger < Base
4
4
 
5
- require 'whos_using_what_search_client'
5
+ def initialize
6
6
 
7
+ require_relative '../api_clients/google_client'
7
8
 
8
- def initialize
9
- @search_client = WhosUsingWhatSearchClient.new
9
+ @search_client = GoogleClient.new
10
10
 
11
11
  @mongo_client = MongoHelper.get_mongo_connection
12
12
  @companies_coll = @mongo_client['companies']
@@ -18,9 +18,18 @@ class TechAdTagger < Base
18
18
  #iterates through array and updates company db record with technologies found from ads from their website
19
19
  def tag_company_with_technologies tech_keywords
20
20
 
21
+ # uncomment if need to clear out all existing technologies
22
+ =begin
23
+ @companies_coll.find().each do |company|
24
+ company['languages'] = {}
25
+ @companies_coll.update({"_id" => company["_id"]}, company)
26
+ end
27
+ =end
28
+
21
29
  companies = @companies_coll.find(
22
- "languages" => {"$exists" => false}
23
- )
30
+ # "languages" => {"$exists" => false}
31
+ # "languages" => {}
32
+ ).to_a
24
33
 
25
34
  languages = Hash.new
26
35
 
@@ -28,7 +37,7 @@ class TechAdTagger < Base
28
37
 
29
38
  languages = Hash.new
30
39
 
31
- company_languages_map = @search_client.search tech_keywords, company["websiteUrl"]
40
+ company_languages_map = @search_client.google_search tech_keywords, company["websiteUrl"]
32
41
 
33
42
  company_languages_map.each do |key, value|
34
43
 
@@ -34,10 +34,10 @@ class DataPopulators
34
34
  @@geo_tagger = GeoTagger.new @@log
35
35
  @@gather_companies = GatherCompanies.new
36
36
  @@companies_searcher = CompaniesSearcher.new @@geo_tagger
37
- @@ech_ad_tagger = TechAdTagger.new
37
+ @@tech_ad_tagger = TechAdTagger.new
38
38
 
39
39
  #data holders
40
- @@facet_location = "us:82" #Sacramento
40
+ @@facet_location = "us:84"
41
41
  @@programming_languages = ["java", "ruby", "c#", "php", "python", "javascript"]
42
42
 
43
43
 
@@ -48,19 +48,34 @@ class DataPopulators
48
48
  if __FILE__ == $PROGRAM_NAME
49
49
 
50
50
 
51
- begin
51
+ t1 = Thread.new do
52
52
 
53
- t1 = Thread.new do
53
+ begin
54
54
 
55
- @@geo_tagger.load_geolocations_into_db
55
+ # @@gather_companies.load_companies_to_db 700, 0, @@facet_location
56
56
 
57
+ rescue Exception => e
58
+ puts e.message
59
+ puts e.backtrace
57
60
  end
58
61
 
59
- rescue Exception => e
60
- puts e.message
61
- puts e.backtrace
62
62
  end
63
63
 
64
+
65
+ t2 = Thread.new do
66
+
67
+ begin
68
+
69
+ @@tech_ad_tagger.tag_company_with_technologies @@programming_languages
70
+
71
+ rescue Exception => e
72
+ puts e.message
73
+ puts e.backtrace
74
+ end
75
+
76
+ end
77
+
78
+
64
79
  #this is necessary, for some reason, otherwise the process just gets killed, as the sub-threads are dependent on the main thread remaining alive
65
80
  while true
66
81
  sleep(5)
@@ -71,15 +86,15 @@ class DataPopulators
71
86
 
72
87
  #examples:
73
88
 
74
- # self_instance.tech_ad_tagger.tag_company_with_technologies self_instance.programming_languages
89
+ # @@tech_ad_tagger.tag_company_with_technologies @@programming_languages
75
90
 
76
- # self_instance.gather_companies.load_companies_to_db 700, 0, facet_location
91
+ # @@gather_companies.load_companies_to_db 700, 0, @@facet_location
77
92
 
78
- # self_instance.geo_tagger.load_geolocations_into_db
93
+ # @@geo_tagger.load_geolocations_into_db
79
94
 
80
- # self_instance.geo_tagger.update_companies_with_latitude_longitude
95
+ # @@geo_tagger.update_companies_with_latitude_longitude
81
96
 
82
- # near = self_instance.companies_searcher.zip_code_search "95688"
97
+ # near = @@companies_searcher.zip_code_search "95688"
83
98
 
84
99
  # near = self_instance.companies_searcher.geospatial_search -122.4099154, 37.8059887
85
100
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whos_using_what
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -91,6 +91,22 @@ dependencies:
91
91
  - - ! '>='
92
92
  - !ruby/object:Gem::Version
93
93
  version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: mechanize
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
94
110
  - !ruby/object:Gem::Dependency
95
111
  name: mongo
96
112
  requirement: !ruby/object:Gem::Requirement
@@ -107,6 +123,38 @@ dependencies:
107
123
  - - ! '>='
108
124
  - !ruby/object:Gem::Version
109
125
  version: '0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: watir-webdriver
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :runtime
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
142
+ - !ruby/object:Gem::Dependency
143
+ name: headless
144
+ requirement: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ! '>='
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ type: :runtime
151
+ prerelease: false
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ! '>='
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
110
158
  description: What companies are using what technologies
111
159
  email: r.dane1010@gmail.com
112
160
  executables: []
@@ -117,9 +165,9 @@ files:
117
165
  - lib/whos_using_what/util/map_data_extraction_util.rb
118
166
  - lib/whos_using_what/logging/logger_factory.rb
119
167
  - lib/whos_using_what/scripts/data_populators.rb
168
+ - lib/whos_using_what/api_clients/google_client.rb
120
169
  - lib/whos_using_what/api_clients/base_api_client.rb
121
170
  - lib/whos_using_what/api_clients/linkedin_client.rb
122
- - lib/whos_using_what/api_clients/whos_using_what_search_client.rb
123
171
  - lib/whos_using_what/api_clients/google_locations_client.rb
124
172
  - lib/whos_using_what/data_gatherers/gather_companies.rb
125
173
  - lib/whos_using_what/data_gatherers/geo_tagger.rb
@@ -1,137 +0,0 @@
1
- require_relative "../base"
2
-
3
- class WhosUsingWhatSearchClient < Base
4
-
5
- require "uri"
6
- require "rest-client"
7
-
8
- attr :results
9
-
10
- def initialize()
11
-
12
-
13
- @negativeMatchUrlPatterns = Array.new.push("google.com").push("youtube.com")
14
-
15
- @positiveMatchUrlPatterns = Array.new.push("http")
16
-
17
- @technologiesToSearchFor = Array.new.push("ruby").push("java").push("javascript").push("python").push("c++").push("c#")
18
-
19
- @jobPageTokens = Array.new.push("job", "hiring", "career")
20
-
21
- @results = Hash.new
22
-
23
- end
24
-
25
- private
26
-
27
- def extractUrls (rawInput, mustContainUrl)
28
-
29
- acceptedUrls = Array.new
30
-
31
- if (rawInput == nil)
32
- return acceptedUrls
33
- end
34
-
35
- urls = []
36
-
37
- begin
38
- urls = URI.extract(rawInput)
39
- end
40
-
41
- if urls.size < 1
42
- return urls
43
- end
44
-
45
- urls.each do |url|
46
- add = true
47
- @negativeMatchUrlPatterns.each do |token|
48
-
49
- if (nil != url.index(token))
50
- add = false
51
- end
52
- end
53
-
54
- @positiveMatchUrlPatterns.each do |token|
55
-
56
- if (nil == url.index(token) || url.index(token) > 0)
57
- add = false
58
- end
59
- end
60
-
61
- if (mustContainUrl != nil && url.index(mustContainUrl) == nil)
62
- add = false
63
- end
64
-
65
- if (add)
66
- acceptedUrls.push(url)
67
- end
68
- end
69
- acceptedUrls
70
- end
71
-
72
-
73
- def arraySearch(array, rawHtml)
74
-
75
- rawHtml = rawHtml.downcase
76
- array.each do |token|
77
- if (rawHtml.index(token) != nil)
78
- return true
79
- end
80
- end
81
- return false
82
- end
83
-
84
-
85
- def determineIfUsesTechnology(technology, rawHtml)
86
-
87
- isJobPage = arraySearch(@jobPageTokens, rawHtml)
88
-
89
- return isJobPage
90
-
91
- end
92
-
93
- public
94
-
95
-
96
- #performs a search engine search that is restricted to a company's website and then attempts to determine if they have job listings for a given technology.
97
- #If an ad exists it is returned as part of map
98
- def search queries, url
99
-
100
- begin
101
- rawHtml = RestClient.get(url)
102
- rescue
103
-
104
- end
105
-
106
- urls = extractUrls(rawHtml, url)
107
-
108
- matching_url = nil
109
-
110
- ret_map = Hash.new
111
-
112
- urls.each do |cur_url|
113
- begin
114
- html = RestClient.get(cur_url)
115
-
116
- queries.each do |query|
117
-
118
- url = "https://www.google.com/search?hl=en&as_q=" << query << "&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all&as_sitesearch=" << cur_url << "&as_occt=any&safe=off&tbs=&as_filetype=&as_rights="
119
-
120
- uses_technology = determineIfUsesTechnology(query, html)
121
-
122
- if (uses_technology)
123
- ret_map[query] = cur_url
124
- end
125
-
126
- end
127
-
128
- rescue Exception => exception
129
- #don't really care at this point, probably not worth logging as some sites just don't end up loading properly
130
- end
131
- end
132
-
133
- ret_map
134
- end
135
-
136
- end
137
-