framework_guesser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +45 -0
  5. data/Rakefile +1 -0
  6. data/framework_guesser.gemspec +24 -0
  7. data/lib/framework_guesser.rb +9 -0
  8. data/lib/framework_guesser/matcher.rb +327 -0
  9. data/lib/framework_guesser/version.rb +3 -0
  10. data/spec/fixtures/adsense-1.html +567 -0
  11. data/spec/fixtures/adsense-2.html +200 -0
  12. data/spec/fixtures/drupal-1.html +294 -0
  13. data/spec/fixtures/drupal-1.yaml +9 -0
  14. data/spec/fixtures/drupal-2.html +296 -0
  15. data/spec/fixtures/drupal-2.yaml +9 -0
  16. data/spec/fixtures/drupal-3.html +161 -0
  17. data/spec/fixtures/drupal-3.yaml +11 -0
  18. data/spec/fixtures/drupal-4.html +173 -0
  19. data/spec/fixtures/drupal-4.yaml +9 -0
  20. data/spec/fixtures/drupal-5.html +159 -0
  21. data/spec/fixtures/drupal-5.yaml +9 -0
  22. data/spec/fixtures/drupal-6.html +199 -0
  23. data/spec/fixtures/drupal-7.html +83 -0
  24. data/spec/fixtures/etarget-1.html +206 -0
  25. data/spec/fixtures/etarget-2.html +369 -0
  26. data/spec/fixtures/etarget-3.html +430 -0
  27. data/spec/fixtures/facebook-1.html +1256 -0
  28. data/spec/fixtures/facebook-1.yaml +11 -0
  29. data/spec/fixtures/facebook-2.html +119 -0
  30. data/spec/fixtures/facebook-2.yaml +11 -0
  31. data/spec/fixtures/facebook-3.html +315 -0
  32. data/spec/fixtures/facebook-3.yaml +8 -0
  33. data/spec/fixtures/opencart-1.html +625 -0
  34. data/spec/fixtures/opencart-2.html +271 -0
  35. data/spec/fixtures/prestashop-1.html +341 -0
  36. data/spec/fixtures/prestashop-2.html +435 -0
  37. data/spec/fixtures/rails-30-1.html +215 -0
  38. data/spec/fixtures/rails-30-2.html +209 -0
  39. data/spec/fixtures/rails-30-3.html +206 -0
  40. data/spec/fixtures/rails-31.html +120 -0
  41. data/spec/fixtures/ubercart-1.html +358 -0
  42. data/spec/fixtures/ubercart-2.html +447 -0
  43. data/spec/framework_guesser_spec.rb +19 -0
  44. data/spec/matcher_spec.rb +227 -0
  45. data/spec/spec_helper.rb +21 -0
  46. data/spec/support/helpers.rb +18 -0
  47. metadata +178 -0
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in framework_guesser.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Martin Lipták
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,45 @@
1
+ # framework_guesser
2
+
3
+ Framework guesser tries to detect frameworks and JavaScript libraries from HTML code
4
+ and a hash of HTTP headers. Some extra information, such as the server, server-side programming language, doctype, meta description and keywords, is returned as well.
5
+
6
+ It is used by [statscrawler.com][1] to analyze sites and collect statistics about
7
+ Internet domains. This gem is a sample (though a working and fairly usable one) for everyone
8
+ interested in how framework detection works on statscrawler.com.
9
+
10
+ [1]: http://www.statscrawler.com
11
+
12
+ ## Usage
13
+
14
+ Requires nokogiri at runtime; rspec and fakeweb are only needed to run the tests.
15
+
16
+ ```ruby
require 'open-uri'
require 'openssl'
require 'framework_guesser'

# Fetch the home page of a few well-known sites and print what
# FrameworkGuesser detects for each of them.
%w[rubyonrails.org drupal.org wordpress.org joomla.org].each do |domain|
  begin
    # URI.open is used because Kernel#open no longer opens URLs on Ruby 3.0+.
    URI.open("http://www.#{domain}",
             :read_timeout => 10,
             :ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE) do |file|
      # base_uri is the final URL after any redirects.
      url = file.base_uri.to_s

      # file.meta is the response header hash, file.read the HTML body.
      result = FrameworkGuesser.guess(file.meta, file.read)
      puts "#{domain} => #{url}"
      puts "Description: #{result[:description]}"
      puts "Keywords: #{result[:keywords]}"
      puts "Server: #{result[:server]}"
      puts "Engine: #{result[:engine]}"
      puts "Doctype: #{result[:doctype]}"
      puts "Framework: #{result[:framework]}"
      puts "Features: #{result[:features].join(', ')}"

      puts
    end
  rescue StandardError => err
    # Network or HTTP failures are reported per domain; the loop continues.
    puts "#{domain} => #{err.message}"
  end
end
45
+ ```
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,24 @@
# -*- encoding: utf-8 -*-
# Gem specification for framework_guesser.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'framework_guesser/version'

Gem::Specification.new do |gem|
  gem.name          = "framework_guesser"
  gem.version       = FrameworkGuesser::VERSION
  gem.authors       = ["Martin Lipták"]
  gem.email         = ["mliptak@gmail.com"]
  # RubyGems expects a short one-line summary and a longer description;
  # the two texts were previously assigned the other way round.
  gem.summary       = %q{Detection of framework and javascript libraries from HTML code.}
  gem.description   = %q{framework_guesser tries to detect frameworks and javascript libraries from HTML code and HTTP headers hash. Some extra information like server, server-side programming language, doctype, meta description and keywords are returned as well.}
  gem.homepage      = "https://github.com/martinliptak/framework_guesser"
  # Matches the MIT license text shipped in LICENSE.txt.
  gem.license       = "MIT"

  gem.add_dependency "nokogiri"

  gem.add_development_dependency "rspec"
  gem.add_development_dependency "fakeweb"

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
@@ -0,0 +1,9 @@
1
+ require "framework_guesser/version"
2
+ require "framework_guesser/matcher"
3
+
module FrameworkGuesser
  # Convenience entry point: runs a fresh Matcher over one HTTP response.
  #
  # headers - hash of HTTP response headers
  # body    - HTML source of the page
  #
  # Returns whatever Matcher#match returns for the given pair.
  def self.guess(headers, body)
    Matcher.new.match(headers, body)
  end
end
@@ -0,0 +1,327 @@
1
+ require 'nokogiri.rb'
2
+
module FrameworkGuesser
  # Guesses the web server, server-side engine, framework and client-side
  # features of a page from its HTTP response headers and HTML body.
  #
  # Detection is an ordered sequence of heuristics (see #run). A later
  # heuristic overwrites the framework/engine chosen by an earlier one unless
  # the value was recorded with #framework_final / #engine_final.
  class Matcher
    # Frameworks whose presence implies a PHP back end.
    PHP_FRAMEWORKS = [:joomla, :wordpress, :drupal, :typo3, :virtuemart, :nette, :ubercart,
                      :cakephp, :symfony, :zencart, :prestashop, :opencart].freeze

    # Frameworks whose presence implies a Ruby back end.
    RUBY_FRAMEWORKS = [:rails].freeze

    # Analyses one HTTP response.
    #
    # headers - hash-like object of response headers; it is queried with the
    #           lower-case names "server", "x-powered-by" and "set-cookie".
    # body    - HTML source of the page.
    #
    # Returns a Hash that always contains :features (an Array without
    # duplicates) and, when detected, :description, :keywords, :server,
    # :engine, :doctype and :framework.
    def match(headers, body)
      @headers = headers
      @data = body
      @result = { :features => [] }
      # Reset the "final" flags so a reused Matcher does not carry a locked
      # framework/engine over from a previous page.
      @final = false
      @engine_final = false
      @document = Nokogiri::HTML(@data)

      description = @document.css("meta[name=description], meta[name=Description]").first
      @result[:description] = description['content'] if description

      # Accept both spellings of the attribute value, as for the description.
      keywords = @document.css("meta[name=keywords], meta[name=Keywords]").first
      @result[:keywords] = keywords['content'] if keywords

      run
      @result[:features].uniq!
      @result
    end

    private

    # Runs every heuristic. The order is significant: later steps may
    # overwrite the framework/engine picked by earlier ones.
    def run
      resources = [] # script, stylesheet and image URLs collected on the way

      detect_server
      detect_engine
      detect_cookies
      detect_raw_markers
      scan_scripts(resources)
      scan_stylesheets(resources)
      scan_inline_styles
      scan_images(resources)
      scan_iframes
      scan_resource_paths(resources)
      scan_generator
      scan_dom_markers
      infer_engine_from_framework
    end

    # Web server from the "server" header; unknown values are kept verbatim.
    def detect_server
      header("server") do |serv|
        case serv
        when /^apache/i
          server :apache
        when /^nginx/i
          server :nginx
        when /^microsoft-iis/i
          server :microsoft_iis
        when /^lighttpd/i
          server :lighttpd
        when /^mongrel/i
          engine :ruby
          server :mongrel
        else
          server serv
        end
      end
    end

    # Engine (and sometimes framework) from the "x-powered-by" header;
    # unknown values are kept verbatim as the engine.
    def detect_engine
      header("x-powered-by") do |x_powered_by|
        case x_powered_by
        when /^php/i
          engine :php
        when /^asp/i
          engine :asp
        when /^nette/i
          framework :nette
          engine :php
        when /^w3 total cache/i
          framework :wordpress
          engine :php
        when /^(servlet|jsp|jsf)/i
          engine :java
        when /^phusion passenger/i
          engine :ruby
          framework :rails
        when "Chuck Norris!"
          engine_final x_powered_by # chuck norris is final
        else
          engine x_powered_by
        end
      end
    end

    # Engine or framework from well-known session cookie names. Only the
    # start of the "set-cookie" value is inspected.
    def detect_cookies
      header("set-cookie") do |set_cookie|
        case set_cookie
        when /^PHPSESSID/
          engine :php
        when /^ASP\.NET/
          engine :asp
        when /^JSESSIONID/
          engine :java
        when /^CAKEPHP/
          framework :cakephp
        when /^symfony/
          framework :symfony
        when /^zenid/
          framework :zencart
        end
      end
    end

    # Markers searched for in the raw HTML text: doctype, the OpenCart
    # "powered by" comment and the Google AdSense section comment.
    def detect_raw_markers
      regexp(/^\s*(<\?xml[^?]+\?>)?\s*<!doctype\s*(html\s*public\s*".[^"]*"|html)/i) do |found|
        # Normalise whitespace and case so equal doctypes compare equal.
        doctype found[2].delete("\n\r").strip.squeeze(" ").downcase
      end

      regexp(/<!--[^>]*Powered By OpenCart/) { framework :opencart }

      regexp(/<!--\s*google_ad_section_start/) { feature :google_adsense }
    end

    # <script> elements: the src URL, its file name and the inline text.
    def scan_scripts(resources)
      element("script") do |script|
        attribute(script, "src") do |src|
          # full path
          case src
          when "/media/system/js/mootools.js"
            framework :joomla
          when "http://www.google-analytics.com/ga.js"
            feature :google_analytics
          when %r{^(http://static\.ak\.fbcdn\.net|http://connect\.facebook\.net)}
            feature :facebook
          when %r{^http://\w{2,3}\.search\.etargetnet\.com}
            feature :etarget
          when "http://pagead2.googlesyndication.com/pagead/show_ads.js"
            feature :google_adsense
          end

          # filename
          filename(src) do |name|
            case name
            when /^jquery-ui/i
              # Must be tested before the plain jquery pattern, which would
              # otherwise swallow it. :jquery is still reported because these
              # files were previously classified as jQuery.
              feature :jquery_ui
              feature :jquery
            when /^jquery/i
              feature :jquery
            when /^prototype/i
              feature :prototype
            when /^scriptaculous/i
              feature :scriptaculous
            when /^mootools/i
              feature :mootools
            when /^drupal/i
              framework :drupal
            end
          end

          resources << src
        end

        # script text content; only the first matching pattern is applied
        case script.content
        when /^.{0,30}jQuery\.extend\(Drupal\.settings/
          framework :drupal
        when /jQuery/
          feature :jquery
        when /['"]UA-[\d-]{5,}["']/
          feature :google_analytics
        when %r{//connect\.facebook\.net}
          feature :facebook
        when %r{var\s*EtargetBannerIdent|http://\w{2,3}\.search\.etargetnet\.com}
          feature :etarget
        end
      end
    end

    # <link rel=stylesheet> elements: the href URL and its file name.
    def scan_stylesheets(resources)
      element("link[rel=stylesheet]") do |style|
        attribute(style, "href") do |href|
          # full path
          case href
          when /modules\/node\/node\.css/
            framework :drupal
          when %r{/themes/[^/]+/css/global\.css}
            framework :prestashop
          when %r{^catalog/view/theme/[^/]+/stylesheet/stylesheet\.css}
            framework :opencart
          end

          # filename
          filename(href) do |name|
            case name
            when /^uc_product/i
              framework_final :ubercart
            end
          end

          resources << href
        end
      end
    end

    # Inline <style> blocks importing Drupal core stylesheets.
    def scan_inline_styles
      element("style") do |style|
        case style.content
        when /^@import "(\/modules\/node\/node\.css|\/misc\/drupal\.css)";/
          framework :drupal
        end
      end
    end

    # Image URLs are only collected for the path heuristics.
    def scan_images(resources)
      element("img") do |img|
        attribute(img, 'src') { |src| resources << src }
      end
    end

    # Facebook widgets embedded through an iframe.
    def scan_iframes
      element("iframe") do |iframe|
        attribute(iframe, "src") do |src|
          case src
          when /^http:\/\/www\.facebook\.com/
            feature :facebook
          end
        end
      end
    end

    # Path conventions in the collected image, stylesheet and script URLs.
    def scan_resource_paths(resources)
      resources.each do |link|
        case link
        when /^\/sites\/[\w\d\-.]+\/(files|themes|modules)/
          framework :drupal
        when /\/(wp-includes\/js|wp-content\/(plugins|themes))\//
          framework :wordpress
        when /com_virtuemart/
          framework_final :virtuemart
        # Rails: either a 10-digit timestamp query string under the classic
        # public directories, or a 32-hex-digit digest in a file name under
        # /assets/ with any number of sub-directories.
        when %r{^/(javascripts|stylesheets|images)/([^/]+/)*[^/]*\.\w{2,4}\?\d{10}$|^/assets/([^/]+/)*[^/]*-[a-f0-9]{32}}
          framework :rails
        end
      end
    end

    # Framework from the <meta name=generator> content.
    def scan_generator
      generator do |content|
        case content
        when /^joomla/i
          framework :joomla
        when /^typo3/i
          framework :typo3
        when /^wordpress/i
          framework :wordpress
        when /^drupal/i
          framework :drupal
        when /^shopping cart program by Zen Cart/
          framework :zencart
        when /^prestashop/i
          framework :prestashop
        end
      end
    end

    # Framework-specific DOM markers.
    def scan_dom_markers
      # rails authenticity token
      element("form input[name=authenticity_token], meta[name=csrf-param], meta[name=csrf-token]") do
        framework :rails
      end

      # ubercart classes
      element(".uc-price, .uc-price-product, .block-uc_cart") do
        framework_final :ubercart
      end
    end

    # framework implies engine
    def infer_engine_from_framework
      engine :php if framework_in?(PHP_FRAMEWORKS)
      engine :ruby if framework_in?(RUBY_FRAMEWORKS)
    end

    # HTTP headers: yields the value stored under +name+, if there is one.
    def header(name)
      value = @headers[name]
      yield value if value
    end

    # Regexps over raw HTML: yields the MatchData when +exp+ matches the body.
    def regexp(exp)
      found = exp.match(@data)
      yield found if found
    rescue ArgumentError
      # Very rare UTF-8 error while matching the body; the heuristic is skipped.
    end

    # DOM element: yields every node matching the CSS selector +path+.
    def element(path)
      @document.css(path).each { |node| yield node }
    end

    # DOM attribute: yields the value of +attr+ on +node+ when it is set.
    def attribute(node, attr)
      value = node[attr]
      yield value if value
    end

    # filename from path: yields everything after the last "/" of +path+
    # (the whole string when it contains no "/").
    def filename(path)
      found = /(\/|^)([^\/]*$)/i.match(path)
      yield found[2] if found
    end

    # meta name=generator: yields its content attribute when present.
    def generator
      tag = @document.css("meta[name=generator], meta[name=Generator]").first
      yield tag["content"] if tag && tag["content"]
    end

    # True when the framework detected so far is one of +frameworks+.
    def framework_in?(frameworks)
      frameworks.include?(@result[:framework])
    end

    # Feature's attributes
    def server(val)
      @result[:server] = val
    end

    # Records the engine unless an earlier heuristic locked it.
    def engine(val)
      @result[:engine] = val unless @engine_final
    end

    # Records the engine and locks it against later heuristics.
    def engine_final(val)
      @result[:engine] = val
      @engine_final = true
    end

    def doctype(val)
      @result[:doctype] = val
    end

    # Records the framework unless an earlier heuristic locked it.
    def framework(val)
      @result[:framework] = val unless @final
    end

    # Records the framework and locks it against later heuristics.
    def framework_final(val)
      @result[:framework] = val
      @final = true
    end

    # Adds a detected feature; duplicates are removed at the end of #match.
    def feature(val)
      @result[:features] << val
    end
  end
end