language_sniffer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/language_sniffer +21 -0
- data/lib/language_sniffer/blob_helper.rb +389 -0
- data/lib/language_sniffer/file_blob.rb +43 -0
- data/lib/language_sniffer/language.rb +345 -0
- data/lib/language_sniffer/languages.yml +1055 -0
- data/lib/language_sniffer/pathname.rb +71 -0
- data/lib/language_sniffer/version.rb +8 -0
- data/lib/language_sniffer.rb +10 -0
- metadata +88 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point: prints the line counts, extension and detected
# language of the single file named by the first argument.
require 'language_sniffer/file_blob'

target = ARGV[0] || ''

# Directories are rejected with their own message; anything else that is not
# a regular file (including a missing argument) gets the usage line.
abort "Cannot parse a whole directory" if File.directory?(target)
abort "usage: language_sniffer <file>" unless File.file?(target)

blob = LanguageSniffer::FileBlob.new(target, Dir.pwd)

puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
puts " extension: #{blob.pathname.extname}"
puts " language: #{blob.language}"

puts " appears to be generated source code" if blob.generated?
|
@@ -0,0 +1,389 @@
|
|
1
|
+
require 'language_sniffer/language'
|
2
|
+
require 'language_sniffer/pathname'
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
module LanguageSniffer
|
6
|
+
# BlobHelper is a mixin for Blobish classes that respond to "name",
|
7
|
+
# "data" and "size" such as Grit::Blob.
|
8
|
+
module BlobHelper
|
9
|
+
# Internal: Wrap Blob#name in a Pathname.
#
# A nil name is treated as the empty path.
#
# Returns a Pathname.
def pathname
  raw_name = name || ""
  Pathname.new(raw_name)
end
|
15
|
+
|
16
|
+
# Public: Get the file extension of the blob's path.
#
# Examples
#
#   blob(name='foo.rb').extname
#   # => '.rb'
#
# Returns whatever #pathname reports as its extname (a String).
def extname
  path = pathname
  path.extname
end
|
27
|
+
|
28
|
+
# Public: Get each line of data.
#
# Requires Blob#data
#
# The data is split on "\n" with a negative limit, so trailing empty
# strings are kept: data ending in a newline yields a final "" element.
# Nil data yields an empty Array. The result is memoized in @lines.
#
# Returns an Array of lines.
def lines
  return @lines if @lines

  content = data
  @lines = content ? content.split("\n", -1) : []
end
|
36
|
+
|
37
|
+
# Public: Get the number of lines of code.
#
# Requires Blob#data
#
# Counts every element of #lines, blank ones included.
#
# Returns Integer.
def loc
  lines.length
end
|
45
|
+
|
46
|
+
# Public: Get the number of source lines of code.
#
# Requires Blob#data
#
# A line counts as source when it contains at least one non-whitespace
# character.
#
# Returns Integer.
def sloc
  lines.count { |line| /\S/ === line }
end
|
54
|
+
|
55
|
+
# Internal: Compute the average line length.
#
# Uses integer division, so the result is truncated. Returns 0 when there
# are no lines.
#
# Returns Integer.
def average_line_length
  return 0 unless lines.any?

  total_chars = lines.inject(0) { |sum, line| sum + line.length }
  total_chars / lines.length
end
|
65
|
+
|
66
|
+
# Public: Is the blob a generated file?
#
# Generated source code is suppressed in diffs and is ignored by
# language statistics.
#
# Requires Blob#data
#
# Includes:
# - XCode project XML files
# - Visual Studio project XML files
# - JavaScript generated by CoffeeScript
# - Minified JavaScript
# - .NET XML documentation files
#
# Please add additional test coverage to
# `test/test_blob.rb#test_generated` if you make any changes.
#
# Return true or false
def generated?
  # Extension-only checks come first.
  return true if xcode_project_file? || visual_studio_project_file?

  # Content-based checks.
  return true if generated_coffeescript? || minified_javascript? || generated_net_docfile?

  false
end
|
91
|
+
|
92
|
+
# Internal: Is the blob an XCode project file?
#
# Decided purely by file extension; the comparison is case-sensitive.
#
# Returns true or false.
def xcode_project_file?
  %w[.xib .nib .pbxproj .xcworkspacedata .xcuserstate].include?(extname)
end
|
101
|
+
|
102
|
+
# Internal: Is the blob a Visual Studio project file?
#
# Decided purely by file extension; the comparison is case-sensitive.
#
# Returns true or false.
def visual_studio_project_file?
  project_extensions = %w[
    .csproj .dbproj .fsproj .pyproj .rbproj .vbproj
    .vcxproj .wixproj .resx .sln .vdproj .isproj
  ]
  project_extensions.include?(extname)
end
|
111
|
+
|
112
|
+
# Internal: Is the blob minified JS?
#
# Only files with a '.js' extension are considered. Such a file is treated
# as minified when its average line length is greater than 100 characters.
#
# Returns true or false.
def minified_javascript?
  # Return an explicit false (not nil) so the predicate honours its contract.
  return false unless extname == '.js'

  average_line_length > 100
end
|
122
|
+
|
123
|
+
# Internal: Is the blob JS generated by CoffeeScript?
#
# Requires Blob#data
#
# CoffeeScript is meant to output JS that would be difficult to tell apart
# from hand-written code, so this looks for a number of patterns emitted by
# the CS compiler: the file must be wrapped in the module closure, and its
# `var` lines must mention enough compiler-generated identifiers.
#
# Returns true or false.
def generated_coffeescript?
  # Return an explicit false (not nil) so the predicate honours its contract.
  return false unless extname == '.js'

  wrapped_in_closure =
    lines[0] == '(function() {' &&     # First line is module closure opening
    lines[-2] == '}).call(this);' &&   # Second to last line closes module closure
    lines[-1] == ''                    # Last line is blank
  return false unless wrapped_in_closure

  score = 0
  lines.each do |line|
    next unless line =~ /var /

    # Underscored temp vars are likely to be Coffee
    score += line.scan(/(_fn|_i|_len|_ref|_results)/).size

    # bind and extend functions are very Coffee specific
    score += 3 * line.scan(/(__bind|__extends|__hasProp|__indexOf|__slice)/).size
  end

  # Require a score of 3. This is fairly arbitrary. Consider tweaking later.
  score >= 3
end
|
158
|
+
|
159
|
+
# Internal: Is this a generated documentation file for a .NET assembly?
#
# Requires Blob#data
#
# .NET developers often check in the XML Intellisense file along with an
# assembly. These files have no special extension, so the contents are
# inspected instead: the file must have an ".xml" extension (any case) and
# more than three lines, the second line must contain <doc>, the third
# <assembly>, and the second to last </doc>.
#
# Returns true or false
def generated_net_docfile?
  return false if extname.downcase != ".xml"
  return false if lines.count <= 3

  opening   = lines[1]
  first_tag = lines[2]
  closing   = lines[-2]

  # .NET docfiles always open with <doc> and their first tag is <assembly>
  opening.include?("<doc>") &&
    first_tag.include?("<assembly>") &&
    closing.include?("</doc>")
end
|
179
|
+
|
180
|
+
# Public: Detects the Language of the blob.
#
# May load Blob#data
#
# The guess is made once and cached in @language. `defined?` is used
# rather than `||=` so that a nil result is cached too.
#
# Returns a Language or nil if none is detected
def language
  return @language if defined?(@language)

  @language = guess_language
end
|
192
|
+
|
193
|
+
# Internal: Guess language
#
# Tries each strategy in order and stops at the first one that yields a
# truthy result.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you make any changes.
#
# Returns a Language or nil
def guess_language
  # Disambiguate between multiple language extensions
  by_content = disambiguate_extension_language
  return by_content if by_content

  # See if there is a Language for the extension
  by_extension = pathname.language
  return by_extension if by_extension

  # Look for idioms in first line
  by_first_line = first_line_language
  return by_first_line if by_first_line

  # Try to detect Language from shebang line
  shebang_language
end
|
212
|
+
|
213
|
+
# Internal: Disambiguates between multiple language extensions.
#
# When Language reports the extension as ambiguous, delegates to
# "guess_EXTENSION_language" (the extension without its leading dot),
# provided this object responds to such a method.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you add another method.
#
# Returns a Language or nil.
def disambiguate_extension_language
  return unless Language.ambiguous?(extname)

  guesser = "guess_#{extname.sub(/^\./, '')}_language"
  respond_to?(guesser) ? send(guesser) : nil
end
|
227
|
+
|
228
|
+
# Internal: Guess language of header files (.h).
#
# Objective-C is chosen when a line starts with one of its @-directives,
# C++ when a line starts with "class " or holds an indented access
# specifier, and C otherwise.
#
# Returns a Language.
def guess_h_language
  objc_directive = /^@(interface|property|private|public|end)/
  cpp_construct  = /^class |^\s+(public|protected|private):/

  return Language['Objective-C'] unless lines.grep(objc_directive).empty?
  return Language['C++'] unless lines.grep(cpp_construct).empty?

  Language['C']
end
|
240
|
+
|
241
|
+
# Internal: Guess language of .m files.
#
# Objective-C heuristics:
# * Keywords
#
# Matlab heuristics:
# * Leading function keyword
# * "%" comments
#
# Objective-C keywords take precedence, and Objective-C is also the
# fallback, to avoid Matlab false positives.
#
# Returns a Language.
def guess_m_language
  objc_keywords = /^#import|@(interface|implementation|property|synthesize|end)/
  return Language['Objective-C'] unless lines.grep(objc_keywords).empty?

  # Either a leading "function " on the first line or any "%" comment line
  # marks the file as Matlab.
  opens_with_function = lines.first.to_s =~ /^function /
  looks_like_matlab = opens_with_function || !lines.grep(/^%/).empty?

  Language[looks_like_matlab ? 'Matlab' : 'Objective-C']
end
|
269
|
+
|
270
|
+
# Internal: Guess language of .pl files
#
# The rules for disambiguation are:
#
# 1. A perl shebang settles it as Perl
# 2. Most Prolog source files have a rule somewhere (marked by the :- operator)
# 3. Default to Perl, because it is more popular
#
# Returns a Language.
def guess_pl_language
  return Language['Perl'] if shebang_script == 'perl'

  has_prolog_rule = !lines.grep(/:-/).empty?
  Language[has_prolog_rule ? 'Prolog' : 'Perl']
end
|
288
|
+
|
289
|
+
# Internal: Guess language of .r files.
#
# Rebol is chosen when any line matches the Rebol pattern below
# (case-insensitive); otherwise the file is taken to be R.
#
# Returns a Language.
def guess_r_language
  rebol_marker = /(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i
  looks_like_rebol = !lines.grep(rebol_marker).empty?

  Language[looks_like_rebol ? 'Rebol' : 'R']
end
|
299
|
+
|
300
|
+
# Internal: Guess language of .gsp files.
#
# Groovy Server Pages is chosen when any line contains one of the template
# markers in the pattern below; otherwise the file is taken to be Gosu.
#
# Returns a Language.
def guess_gsp_language
  gsp_marker = /<%|<%@|\$\{|<%|<g:|<meta name="layout"|<r:/
  has_gsp_markup = !lines.grep(gsp_marker).empty?

  Language[has_gsp_markup ? 'Groovy Server Pages' : 'Gosu']
end
|
310
|
+
|
311
|
+
# Internal: Guess language from the first line.
#
# Look for leading "<?php"
#
# Returns the PHP Language, or nil when the first line does not start
# with "<?php".
def first_line_language
  opening_line = lines.first.to_s
  return unless opening_line =~ /^<\?php/

  Language['PHP']
end
|
321
|
+
|
322
|
+
# Internal: Extract the script name from the shebang line
#
# Requires Blob#data
#
# Only the first line is treated as the shebang. Any whitespace between
# "#!" and the interpreter path is ignored, an "env" interpreter is
# replaced by its first argument, and the first run of version digits is
# stripped from the name.
#
# Examples
#
#   '#!/usr/bin/ruby'
#   # => 'ruby'
#
#   '#!/usr/bin/env ruby'
#   # => 'ruby'
#
#   '#!/usr/bash/python2.4'
#   # => 'python'
#
#   '#!   /bin/bash'
#   # => 'bash'
#
# Please add additional test coverage to
# `test/test_blob.rb#test_shebang_script` if you make any changes.
#
# Returns a script name String or nil
def shebang_script
  first_line = lines.first.to_s
  return unless first_line =~ /\A#!/

  # Glue the interpreter path to "#!" so it is always the first token,
  # however much whitespace followed the "#!".
  tokens = first_line.sub(/\A#!\s*/, '#!').split(' ')
  pieces = tokens.first.split('/')
  script = pieces.size > 1 ? pieces.last : pieces.first.sub('#!', '')

  # "#!/usr/bin/env ruby" => the interpreter is env's first argument,
  # which is nil when env was given no argument at all.
  script = tokens[1] if script == 'env'
  return unless script

  # python2.4 => python
  script = script.sub(/(?:\d+\.?)+/, '')

  # Check for multiline shebang hacks that exec themselves
  #
  #   #!/bin/sh
  #   exec foo "$0" "$@"
  #
  if script == 'sh'
    lines.first(5).each do |line|
      exec_match = line.match(/exec (\w+).+\$0.+\$@/)
      next unless exec_match

      script = exec_match[1]
      break
    end
  end

  script
end
|
372
|
+
|
373
|
+
# Internal: Get Language for shebang script
#
# Looks the script name from #shebang_script up in Language; no lookup is
# made when there is no script name.
#
# Returns the Language or nil
def shebang_language
  script = shebang_script
  script ? Language[script] : nil
end
|
381
|
+
|
382
|
+
# Load-time sanity check: for every extension listed by
# Language.overridden_extensions, warn when this module has no matching
# "guess_EXTENSION_language" instance method.
Language.overridden_extensions.each do |extension|
  guesser = "guess_#{extension.sub(/^\./, '')}_language".to_sym
  next if method_defined?(guesser)

  warn "Language##{guesser} was not defined"
end
|
388
|
+
end
|
389
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'language_sniffer/blob_helper'
|
2
|
+
|
3
|
+
module LanguageSniffer
  # A FileBlob is a wrapper around a File object to make it quack
  # like a Grit::Blob. It provides the basic interface: `name`,
  # `data`, and `size`.
  class FileBlob
    include BlobHelper

    # Public: Initialize a new FileBlob from a path
    #
    # path      - A path String that exists on the file system.
    # base_path - Optional base to relativize the path. Only a leading
    #             "<base_path>/" is stripped; a path that does not start
    #             with it is kept unchanged.
    # data      - Optional preloaded contents. When given (non-nil), #data
    #             returns it instead of reading the file.
    #
    # Returns a FileBlob.
    def initialize(path, base_path = nil, data = nil)
      @path = path
      @name = base_path ? path.sub(/\A#{Regexp.escape("#{base_path}/")}/, '') : path
      @data = data
    end

    # Public: Filename
    #
    # Examples
    #
    #   FileBlob.new("/path/to/language_sniffer/lib/language_sniffer.rb").name
    #   # => "/path/to/language_sniffer/lib/language_sniffer.rb"
    #
    #   FileBlob.new("/path/to/language_sniffer/lib/language_sniffer.rb",
    #                "/path/to/language_sniffer").name
    #   # => "lib/language_sniffer.rb"
    #
    # Returns a String
    attr_reader :name

    # Public: Read file contents.
    #
    # The contents are read from disk on first use and cached in @data.
    #
    # Returns a String.
    def data
      @data ||= File.read(@path)
    end
  end
end
|