language_sniffer 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/language_sniffer +21 -0
- data/lib/language_sniffer/blob_helper.rb +389 -0
- data/lib/language_sniffer/file_blob.rb +43 -0
- data/lib/language_sniffer/language.rb +345 -0
- data/lib/language_sniffer/languages.yml +1055 -0
- data/lib/language_sniffer/pathname.rb +71 -0
- data/lib/language_sniffer/version.rb +8 -0
- data/lib/language_sniffer.rb +10 -0
- metadata +88 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'language_sniffer/file_blob'
|
4
|
+
|
5
|
+
# Command-line entry point: print line counts, extension and detected
# language for the single file named by the first argument.
path = ARGV[0] || ''

# Directories get their own message instead of the generic usage text.
abort "Cannot parse a whole directory" if File.directory?(path)

# Anything that is not a regular file (including a missing argument,
# which becomes the empty path above) falls through to the usage message.
abort "usage: language_sniffer <file>" unless File.file?(path)

blob = LanguageSniffer::FileBlob.new(path, Dir.pwd)

puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
puts " extension: #{blob.pathname.extname}"
puts " language: #{blob.language}"
puts " appears to be generated source code" if blob.generated?
|
@@ -0,0 +1,389 @@
|
|
1
|
+
require 'language_sniffer/language'
|
2
|
+
require 'language_sniffer/pathname'
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
module LanguageSniffer
|
6
|
+
# BlobHelper is a mixin for Blobish classes that respond to "name",
|
7
|
+
# "data" and "size" such as Grit::Blob.
|
8
|
+
module BlobHelper
|
9
|
+
# Internal: Get a Pathname wrapper for Blob#name
|
10
|
+
#
|
11
|
+
# Returns a Pathname.
|
12
|
+
def pathname
|
13
|
+
Pathname.new(name || "")
|
14
|
+
end
|
15
|
+
|
16
|
+
# Public: Get the extname of the path.
#
# Examples
#
#   blob(name='foo.rb').extname
#   # => '.rb'
#
# Returns a String.
def extname
  pathname.extname
end
|
27
|
+
|
28
|
+
# Public: Get each line of data.
#
# Requires Blob#data
#
# The data is split on "\n" with a negative limit, so trailing empty
# strings are kept: data ending in a newline yields a final "" element.
# The result is memoized in @lines.
#
# Returns an Array of lines (empty when data is nil).
def lines
  @lines ||= (data ? data.split("\n", -1) : [])
end
|
36
|
+
|
37
|
+
# Public: Get number of lines of code.
#
# Requires Blob#data
#
# This is simply the number of elements returned by #lines.
#
# Returns Integer
def loc
  lines.size
end
|
45
|
+
|
46
|
+
# Public: Get number of source lines of code.
#
# Requires Blob#data
#
# Only lines containing at least one non-whitespace character are counted.
#
# Returns Integer
def sloc
  lines.grep(/\S/).size
end
|
54
|
+
|
55
|
+
# Internal: Compute average line length.
#
# The total length of all lines is divided by the number of lines using
# integer division, so the result is truncated.
#
# Returns Integer (0 when there are no lines).
def average_line_length
  return 0 if lines.empty?

  total = lines.inject(0) { |sum, line| sum + line.length }
  total / lines.length
end
|
65
|
+
|
66
|
+
# Public: Is the blob a generated file?
#
# Generated source code is suppressed in diffs and is ignored by
# language statistics.
#
# Requires Blob#data
#
# Includes:
# - XCode project XML files
# - Visual Studio project XML files
# - JavaScript generated by CoffeeScript
# - Minified JavaScript
# - .NET XML documentation files
#
# Please add additional test coverage to
# `test/test_blob.rb#test_generated` if you make any changes.
#
# Return true or false
def generated?
  # Extension-only checks run first; the remaining checks inspect the data.
  return true if xcode_project_file? || visual_studio_project_file?
  return true if generated_coffeescript? || minified_javascript? || generated_net_docfile?

  false
end
|
91
|
+
|
92
|
+
# Internal: Is the blob an XCode project file?
#
# Generated if the file extension is an XCode project
# file extension. The comparison is an exact, case-sensitive match.
#
# Returns true or false.
def xcode_project_file?
  ['.xib', '.nib', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
end
|
101
|
+
|
102
|
+
# Internal: Is the blob a Visual Studio project file?
#
# Generated if the file extension is a Visual Studio project
# file extension. The comparison is an exact, case-sensitive match.
#
# Returns true or false.
def visual_studio_project_file?
  [
    '.csproj', '.dbproj', '.fsproj', '.pyproj', '.rbproj', '.vbproj',
    '.vcxproj', '.wixproj', '.resx', '.sln', '.vdproj', '.isproj'
  ].include?(extname)
end
|
111
|
+
|
112
|
+
# Internal: Is the blob minified JS?
#
# Consider JS minified if the average line length is
# greater than 100 characters.
#
# Returns true or false (false, not nil, for non-".js" blobs).
def minified_javascript?
  return false unless extname == '.js'

  average_line_length > 100
end
|
122
|
+
|
123
|
+
# Internal: Is the blob JS generated by CoffeeScript?
#
# Requires Blob#data
#
# CoffeeScript is meant to output JS that would be difficult to
# tell if it was generated or not. Look for a number of patterns
# output by the CS compiler.
#
# Return true or false (false, not nil, for non-".js" blobs).
def generated_coffeescript?
  return false unless extname == '.js'

  # The output must be wrapped in the module closure: opening on the first
  # line, closing on the second to last line, and a blank last line
  # (i.e. the data ends with a newline).
  wrapped = lines[0] == '(function() {' &&
            lines[-2] == '}).call(this);' &&
            lines[-1] == ''
  return false unless wrapped

  score = 0

  lines.each do |line|
    next unless line =~ /var /

    # Underscored temp vars are likely to be Coffee
    score += 1 * line.scan(/(_fn|_i|_len|_ref|_results)/).size

    # bind and extend functions are very Coffee specific
    score += 3 * line.scan(/(__bind|__extends|__hasProp|__indexOf|__slice)/).size
  end

  # Require a score of 3. This is fairly arbitrary. Consider
  # tweaking later.
  score >= 3
end
|
158
|
+
|
159
|
+
# Internal: Is this a generated documentation file for a .NET assembly?
#
# Requires Blob#data
#
# .NET developers often check in the XML Intellisense file along with an
# assembly - however, these don't have a special extension, so we have to
# dig into the contents to determine if it's a docfile. Luckily, these files
# are extremely structured, so recognizing them is easy.
#
# Returns true or false
def generated_net_docfile?
  return false unless extname.downcase == ".xml"
  return false unless lines.count > 3

  # .NET Docfiles always open with <doc> and their first tag is an
  # <assembly> tag. The closing </doc> is looked for on the second to last
  # line, so the last element of #lines is expected to follow it.
  lines[1].include?("<doc>") &&
    lines[2].include?("<assembly>") &&
    lines[-2].include?("</doc>")
end
|
179
|
+
|
180
|
+
# Public: Detects the Language of the blob.
#
# May load Blob#data
#
# The result of #guess_language is memoized in @language. The check uses
# `defined?` rather than `||=` so that a nil result is cached as well and
# the guess is not repeated.
#
# Returns a Language or nil if none is detected
def language
  @language = guess_language unless defined?(@language)
  @language
end
|
192
|
+
|
193
|
+
# Internal: Guess language
#
# Tries each strategy in order and returns the first non-nil result.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you make any changes.
#
# Returns a Language or nil
def guess_language
  # Disambiguate between multiple language extensions
  disambiguate_extension_language ||

    # See if there is a Language for the extension
    pathname.language ||

    # Look for idioms in first line
    first_line_language ||

    # Try to detect Language from shebang line
    shebang_language
end
|
212
|
+
|
213
|
+
# Internal: Disambiguates between multiple language extensions.
#
# Delegates to "guess_EXTENSION_language" when the extension is reported
# ambiguous by Language.ambiguous? and such a method is available.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you add another method.
#
# Returns a Language or nil.
def disambiguate_extension_language
  return unless Language.ambiguous?(extname)

  # ".h" => "guess_h_language". A distinct local name is used so the
  # blob's own #name method is not shadowed.
  guesser = "guess_#{extname.sub(/^\./, '')}_language"
  send(guesser) if respond_to?(guesser)
end
|
227
|
+
|
228
|
+
# Internal: Guess language of header files (.h).
#
# Objective-C is chosen when a line starts with one of its "@" directives,
# C++ when a line starts with "class " or holds an indented access
# specifier, and C otherwise.
#
# Returns a Language.
def guess_h_language
  if lines.any? { |line| line =~ /^@(interface|property|private|public|end)/ }
    Language['Objective-C']
  elsif lines.any? { |line| line =~ /^class |^\s+(public|protected|private):/ }
    Language['C++']
  else
    Language['C']
  end
end
|
240
|
+
|
241
|
+
# Internal: Guess language of .m files.
#
# Objective-C heuristics:
# * Keywords
#
# Matlab heuristics:
# * Leading function keyword
# * "%" comments
#
# Returns a Language.
def guess_m_language
  # Objective-C keywords
  if lines.any? { |line| line =~ /^#import|@(interface|implementation|property|synthesize|end)/ }
    Language['Objective-C']

  # File function
  elsif lines.first.to_s =~ /^function /
    Language['Matlab']

  # Matlab comment
  elsif lines.any? { |line| line =~ /^%/ }
    Language['Matlab']

  # Fallback to Objective-C, don't want any Matlab false positives
  else
    Language['Objective-C']
  end
end
|
269
|
+
|
270
|
+
# Internal: Guess language of .pl files
#
# The rules for disambiguation are:
#
# 1. Many perl files begin with a shebang
# 2. Most Prolog source files have a rule somewhere (marked by the :- operator)
# 3. Default to Perl, because it is more popular
#
# Returns a Language.
def guess_pl_language
  # A perl shebang wins even if ":-" appears somewhere in the file.
  return Language['Perl'] if shebang_script == 'perl'
  return Language['Prolog'] if lines.any? { |line| line =~ /:-/ }

  Language['Perl']
end
|
288
|
+
|
289
|
+
# Internal: Guess language of .r files.
#
# Rebol is chosen when any line mentions "rebol" or contains one of a few
# Rebol block idioms (case-insensitive); everything else is treated as R.
#
# Returns a Language.
def guess_r_language
  if lines.any? { |line| line =~ /(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i }
    Language['Rebol']
  else
    Language['R']
  end
end
|
299
|
+
|
300
|
+
# Internal: Guess language of .gsp files.
#
# Groovy Server Pages is chosen when any line contains template markup
# ("<%", "${", "<g:", "<r:" or a layout <meta> tag); otherwise Gosu.
#
# Returns a Language.
def guess_gsp_language
  if lines.any? { |line| line =~ /<%|<%@|\$\{|<%|<g:|<meta name="layout"|<r:/ }
    Language['Groovy Server Pages']
  else
    Language['Gosu']
  end
end
|
310
|
+
|
311
|
+
# Internal: Guess language from the first line.
#
# Look for leading "<?php"
#
# Returns a Language, or nil when the first line does not start with "<?php".
def first_line_language
  # to_s makes an empty blob (no first line) a plain non-match.
  Language['PHP'] if lines.first.to_s =~ /^<\?php/
end
|
321
|
+
|
322
|
+
# Internal: Extract the script name from the shebang line
#
# Requires Blob#data
#
# Examples
#
#   '#!/usr/bin/ruby'
#   # => 'ruby'
#
#   '#!/usr/bin/env ruby'
#   # => 'ruby'
#
#   '#!/usr/bash/python2.4'
#   # => 'python'
#
# Please add additional test coverage to
# `test/test_blob.rb#test_shebang_script` if you make any changes.
#
# Returns a script name String or nil
def shebang_script
  return unless lines.any?

  first = lines[0].match(/(.+)\n?/)
  return unless first

  bang = first[0]
  return unless bang =~ /^#!/

  # "#! /bin/sh" => "#!/bin/sh", so the interpreter path is the first token.
  bang = bang.sub(/^#! /, '#!')
  tokens = bang.split(' ')
  pieces = tokens.first.split('/')

  # With a path, the interpreter is the last path component; without one
  # ("#!ruby") it is whatever follows the "#!".
  script = pieces.size > 1 ? pieces.last : pieces.first.sub('#!', '')

  # "#!/usr/bin/env ruby" names the real interpreter in the second token
  # (nil when env is given no argument).
  script = tokens[1] if script == 'env'

  # python2.4 => python
  if script && (version = script[/(?:\d+\.?)+/])
    script = script.sub(version, '')
  end

  # Check for multiline shebang hacks that exec themselves
  #
  #   #!/bin/sh
  #   exec foo "$0" "$@"
  #
  if script == 'sh'
    lines[0...5].each do |line|
      exec_match = line.match(/exec (\w+).+\$0.+\$@/)
      next unless exec_match

      script = exec_match[1]
      break
    end
  end

  script
end
|
372
|
+
|
373
|
+
# Internal: Get Language for shebang script
#
# Looks the script name from #shebang_script up via Language[].
#
# Returns the Language or nil
def shebang_language
  script = shebang_script
  Language[script] if script
end
|
381
|
+
|
382
|
+
# Load-time sanity check: every extension listed by
# Language.overridden_extensions should have a matching
# "guess_EXTENSION_language" instance method in this module. A warning is
# printed for each one that is missing.
Language.overridden_extensions.each do |extension|
  name = "guess_#{extension.sub(/^\./, '')}_language".to_sym
  next if instance_methods.map(&:to_sym).include?(name)

  # NOTE(review): the message says "Language#", but the lookup above is
  # against this module's instance methods - confirm the intended wording.
  warn "Language##{name} was not defined"
end
|
388
|
+
end
|
389
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'language_sniffer/blob_helper'
|
2
|
+
|
3
|
+
module LanguageSniffer
|
4
|
+
# A FileBlob is a wrapper around a file on disk to make it quack
# like a Grit::Blob. It defines `name` and `data` and mixes in BlobHelper.
class FileBlob
  include BlobHelper

  # Public: Initialize a new FileBlob from a path
  #
  # path      - A path String that exists on the file system.
  # base_path - Optional base to relativize the path
  # data      - Optional contents String. When given, it is returned by
  #             #data and the file is not read.
  #
  # Returns a FileBlob.
  def initialize(path, base_path = nil, data = nil)
    @path = path
    @data = data

    # Strip the base only when it is a leading prefix of the path, so an
    # occurrence of "base_path/" in the middle of the path is left intact.
    prefix = base_path ? "#{base_path}/" : nil
    @name = if prefix && path.start_with?(prefix)
              path[prefix.length..-1]
            else
              path
            end
  end

  # Public: Filename
  #
  # Examples
  #
  #   FileBlob.new("/path/to/language_sniffer/lib/language_sniffer.rb").name
  #   # => "/path/to/language_sniffer/lib/language_sniffer.rb"
  #
  #   FileBlob.new("/path/to/language_sniffer/lib/language_sniffer.rb",
  #                "/path/to/language_sniffer").name
  #   # => "lib/language_sniffer.rb"
  #
  # Returns a String
  attr_reader :name

  # Public: Read file contents.
  #
  # The contents are read from the path on first use and memoized.
  #
  # Returns a String.
  def data
    @data ||= File.read(@path)
  end
end
|
43
|
+
end
|