language_sniffer 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,21 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'language_sniffer/file_blob'
4
+
5
# Target path comes from the first CLI argument; a missing argument becomes
# the empty string, which is neither a directory nor a file and therefore
# ends in the usage message below.
path = ARGV[0] || ''

# Reject anything that is not a regular file before doing any work.
abort "Cannot parse a whole directory" if File.directory?(path)
abort "usage: language_sniffer <file>" unless File.file?(path)

blob = LanguageSniffer::FileBlob.new(path, Dir.pwd)

# Summary line followed by one detail per line.
puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
puts " extension: #{blob.pathname.extname}"
puts " language: #{blob.language}"
puts " appears to be generated source code" if blob.generated?
@@ -0,0 +1,389 @@
1
+ require 'language_sniffer/language'
2
+ require 'language_sniffer/pathname'
3
+ require 'yaml'
4
+
5
+ module LanguageSniffer
6
+ # BlobHelper is a mixin for Blobish classes that respond to "name",
7
+ # "data" and "size" such as Grit::Blob.
8
+ module BlobHelper
9
# Internal: Wrap Blob#name in a Pathname.
#
# A nil name is treated as the empty string.
#
# Returns a Pathname.
def pathname
  blob_name = name || ""
  Pathname.new(blob_name)
end
15
+
16
# Public: Extension of the blob's path, including the leading dot.
#
# Examples
#
#   blob(name='foo.rb').extname
#   # => '.rb'
#
# Returns a String.
def extname
  path = pathname
  path.extname
end
27
+
28
# Public: Split the blob's data into lines.
#
# Requires Blob#data
#
# The data is split on "\n" with trailing empty fields kept, so data ending
# in a newline yields a final empty string. The result is memoized.
#
# Returns an Array of line Strings (empty when data is nil).
def lines
  return @lines if @lines

  raw = data
  @lines = raw ? raw.split("\n", -1) : []
end
36
+
37
# Public: Total number of lines, blank ones included.
#
# Requires Blob#data
#
# Returns Integer.
def loc
  lines.length
end
45
+
46
# Public: Number of source lines, i.e. lines that contain at least one
# non-whitespace character.
#
# Requires Blob#data
#
# Returns Integer.
def sloc
  lines.count { |line| line =~ /\S/ }
end
54
+
55
# Internal: Compute the average line length.
#
# Uses integer division, so the result is truncated.
#
# Returns Integer (0 when there are no lines).
def average_line_length
  return 0 unless lines.any?

  # Pure fold: the block's value becomes the next accumulator, so no
  # assignment to the accumulator is needed.
  lines.inject(0) { |total, line| total + line.length } / lines.length
end
65
+
66
# Public: Is the blob a generated file?
#
# Generated source code is suppressed in diffs and is ignored by
# language statistics.
#
# Requires Blob#data
#
# Includes:
# - XCode project XML files
# - Visual Studio project XML files
# - JavaScript generated by CoffeeScript
# - Minified JavaScript
# - .NET XML documentation files
#
# Please add additional test coverage to
# `test/test_blob.rb#test_generated` if you make any changes.
#
# Returns true or false.
def generated?
  # Extension-only checks come first.
  return true if xcode_project_file? || visual_studio_project_file?
  # These checks inspect the blob's lines.
  return true if generated_coffeescript? || minified_javascript? || generated_net_docfile?

  false
end
91
+
92
# Internal: Is the blob an XCode project file?
#
# Decided purely by file extension.
#
# Returns true or false.
def xcode_project_file?
  xcode_extensions = %w[.xib .nib .pbxproj .xcworkspacedata .xcuserstate]
  xcode_extensions.include?(extname)
end
101
+
102
# Internal: Is the blob a Visual Studio project file?
#
# Decided purely by file extension.
#
# Returns true or false.
def visual_studio_project_file?
  vs_extensions = %w[
    .csproj .dbproj .fsproj .pyproj .rbproj .vbproj
    .vcxproj .wixproj .resx .sln .vdproj .isproj
  ]
  vs_extensions.include?(extname)
end
111
+
112
# Internal: Is the blob minified JS?
#
# Consider JS minified if the average line length is
# greater than 100 characters.
#
# Returns true or false.
def minified_javascript?
  # Return an explicit false (not nil) so the predicate is strictly boolean.
  return false unless extname == '.js'

  average_line_length > 100
end
122
+
123
# Internal: Is the blob JS generated by CoffeeScript?
#
# Requires Blob#data
#
# CoffeeScript is meant to output JS that would be difficult to
# tell if it was generated or not. Look for a number of patterns
# output by the CS compiler.
#
# Returns true or false.
def generated_coffeescript?
  # Return an explicit false (not nil) so the predicate is strictly boolean.
  return false unless extname == '.js'

  wrapped = lines[0] == '(function() {' &&   # First line is module closure opening
            lines[-2] == '}).call(this);' && # Second to last line closes module closure
            lines[-1] == ''                  # Last line is blank
  return false unless wrapped

  score = 0
  lines.each do |line|
    next unless line =~ /var /

    # Underscored temp vars are likely to be Coffee
    score += line.scan(/_fn|_i|_len|_ref|_results/).size

    # bind and extend functions are very Coffee specific
    score += 3 * line.scan(/__bind|__extends|__hasProp|__indexOf|__slice/).size
  end

  # Require a score of 3. This is fairly arbitrary. Consider
  # tweaking later.
  score >= 3
end
158
+
159
# Internal: Is this a generated documentation file for a .NET assembly?
#
# Requires Blob#data
#
# .NET developers often check in the XML Intellisense file along with an
# assembly - however, these don't have a special extension, so we have to
# dig into the contents to determine if it's a docfile. Luckily, these files
# are extremely structured, so recognizing them is easy.
#
# Returns true or false.
def generated_net_docfile?
  return false if extname.downcase != ".xml"
  return false if lines.count <= 3

  # .NET Docfiles always open with <doc> and their first tag is an
  # <assembly> tag; the second to last line closes <doc>.
  second, third, penultimate = lines[1], lines[2], lines[-2]
  second.include?("<doc>") &&
    third.include?("<assembly>") &&
    penultimate.include?("</doc>")
end
179
+
180
# Public: Detects the Language of the blob.
#
# May load Blob#data
#
# The guess is cached, including a nil result, so detection runs at most
# once per blob.
#
# Returns a Language or nil if none is detected.
def language
  return @language if defined?(@language)

  @language = guess_language
end
192
+
193
# Internal: Guess language
#
# Tries each strategy in order and stops at the first one that yields
# a result.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you make any changes.
#
# Returns a Language or nil.
def guess_language
  # Disambiguate between multiple language extensions
  guess = disambiguate_extension_language
  # See if there is a Language for the extension
  guess ||= pathname.language
  # Look for idioms in first line
  guess ||= first_line_language
  # Try to detect Language from shebang line
  guess ||= shebang_language
  guess
end
212
+
213
# Internal: Disambiguates between multiple language extensions.
#
# Delegates to "guess_EXTENSION_language" when the extension is ambiguous
# and such a method exists.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you add another method.
#
# Returns a Language or nil.
def disambiguate_extension_language
  return unless Language.ambiguous?(extname)

  # ".h" => "guess_h_language"
  guesser = "guess_#{extname.sub(/^\./, '')}_language"
  respond_to?(guesser) ? send(guesser) : nil
end
227
+
228
# Internal: Guess language of header files (.h).
#
# Objective-C directives win over C++ class/access-specifier markers;
# plain C is the fallback.
#
# Returns a Language.
def guess_h_language
  objc_marker = /^@(interface|property|private|public|end)/
  cpp_marker  = /^class |^\s+(public|protected|private):/

  return Language['Objective-C'] if lines.grep(objc_marker).any?
  return Language['C++'] if lines.grep(cpp_marker).any?

  Language['C']
end
240
+
241
# Internal: Guess language of .m files.
#
# Objective-C heuristics:
# * Keywords
#
# Matlab heuristics:
# * Leading function keyword
# * "%" comments
#
# Returns a Language.
def guess_m_language
  # Objective-C keywords take precedence over every Matlab hint
  objc_keywords = /^#import|@(interface|implementation|property|synthesize|end)/
  return Language['Objective-C'] if lines.grep(objc_keywords).any?

  # File function, or a Matlab comment anywhere in the file
  matlab = (lines.first.to_s =~ /^function /) || lines.grep(/^%/).any?

  # Fallback to Objective-C, don't want any Matlab false positives
  Language[matlab ? 'Matlab' : 'Objective-C']
end
269
+
270
# Internal: Guess language of .pl files
#
# The rules for disambiguation are:
#
# 1. Many perl files begin with a shebang
# 2. Most Prolog source files have a rule somewhere (marked by the :- operator)
# 3. Default to Perl, because it is more popular
#
# Returns a Language.
def guess_pl_language
  # A perl shebang settles it before the contents are inspected
  return Language['Perl'] if shebang_script == 'perl'

  lines.grep(/:-/).any? ? Language['Prolog'] : Language['Perl']
end
288
+
289
# Internal: Guess language of .r files.
#
# Rebol is chosen when any line matches the Rebol marker pattern
# (case-insensitive); otherwise the file is taken to be R.
#
# Returns a Language.
def guess_r_language
  rebol_marker = /(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i
  Language[lines.grep(rebol_marker).any? ? 'Rebol' : 'R']
end
299
+
300
# Internal: Guess language of .gsp files.
#
# A file is treated as Groovy Server Pages when any line contains a
# scriptlet/directive opener ("<%", which also covers "<%@"), a "${"
# expression, a "<g:" or "<r:" tag, or a layout meta tag. Anything else
# is taken to be Gosu.
#
# Returns a Language.
def guess_gsp_language
  # "<%@" and the repeated "<%" alternative were redundant: both are
  # already matched by "<%".
  gsp_marker = /<%|\$\{|<g:|<meta name="layout"|<r:/

  if lines.grep(gsp_marker).any?
    Language['Groovy Server Pages']
  else
    Language['Gosu']
  end
end
310
+
311
# Internal: Guess language from the first line.
#
# Look for leading "<?php"
#
# Returns a Language, or nil when the first line is not a PHP open tag.
def first_line_language
  Language['PHP'] if lines.first.to_s =~ /^<\?php/
end
321
+
322
# Internal: Extract the script name from the shebang line
#
# Requires Blob#data
#
# Examples
#
#   '#!/usr/bin/ruby'
#   # => 'ruby'
#
#   '#!/usr/bin/env ruby'
#   # => 'ruby'
#
#   '#!/usr/bin/python2.4'
#   # => 'python'
#
# Please add additional test coverage to
# `test/test_blob.rb#test_shebang_script` if you make any changes.
#
# Returns a script name String or nil
def shebang_script
  return unless lines.any?

  bang = lines[0][/(.+)\n?/]
  return unless bang =~ /^#!/

  # Tolerate a single space between "#!" and the interpreter path
  tokens = bang.sub(/^#! /, '#!').split(' ')
  pieces = tokens.first.split('/')
  script = pieces.size > 1 ? pieces.last : pieces.first.sub('#!', '')

  # "#!/usr/bin/env ruby" names the interpreter in the next token
  script = tokens[1] if script == 'env'

  # python2.4 => python
  version = /((?:\d+\.?)+)/
  script = script.sub(version, '') if script =~ version

  # Check for multiline shebang hacks that exec themselves
  #
  #   #!/bin/sh
  #   exec foo "$0" "$@"
  #
  if script == 'sh'
    exec_hack = /exec (\w+).+\$0.+\$@/
    launcher = lines[0...5].find { |line| line =~ exec_hack }
    script = launcher[exec_hack, 1] if launcher
  end

  script
end
372
+
373
# Internal: Get Language for shebang script
#
# Returns the Language or nil (nil when there is no shebang script).
def shebang_language
  interpreter = shebang_script
  return unless interpreter

  Language[interpreter]
end
381
+
382
# Load-time sanity check: warn about every overridden extension that has no
# matching "guess_EXTENSION_language" instance method on this module.
defined_guessers = instance_methods.map(&:to_sym)
Language.overridden_extensions.each do |extension|
  name = "guess_#{extension.sub(/^\./, '')}_language".to_sym
  next if defined_guessers.include?(name)

  warn "Language##{name} was not defined"
end
388
+ end
389
+ end
@@ -0,0 +1,43 @@
1
+ require 'language_sniffer/blob_helper'
2
+
3
+ module LanguageSniffer
4
# A FileBlob is a wrapper around a file on disk to make it quack
# like a Grit::Blob. It defines `name` and `data` itself; the remaining
# behavior is mixed in from BlobHelper.
class FileBlob
  include BlobHelper

  # Public: Filename
  #
  # Examples
  #
  #   FileBlob.new("/path/to/language_sniffer/lib/language_sniffer.rb").name
  #   # => "/path/to/language_sniffer/lib/language_sniffer.rb"
  #
  #   FileBlob.new("/path/to/language_sniffer/lib/language_sniffer.rb",
  #                "/path/to/language_sniffer").name
  #   # => "lib/language_sniffer.rb"
  #
  # Returns a String
  attr_reader :name

  # Public: Initialize a new FileBlob from a path
  #
  # path      - A path String that exists on the file system.
  # base_path - Optional base to relativize the path. It is only removed
  #             when path actually starts with "base_path/"; otherwise
  #             the name is the path unchanged.
  # data      - Optional String of file contents. When given, the file is
  #             not read from disk.
  #
  # Returns a FileBlob.
  def initialize(path, base_path = nil, data = nil)
    @path = path
    @data = data

    # Strip the base only as a leading prefix. A plain String#sub would
    # remove the first occurrence anywhere inside the path.
    prefix = base_path ? "#{base_path}/" : nil
    @name = if prefix && path.start_with?(prefix)
              path[prefix.length..-1]
            else
              path
            end
  end

  # Public: Read file contents.
  #
  # Contents are read from disk on first use and cached, unless they were
  # supplied to the constructor.
  #
  # Returns a String.
  def data
    @data ||= File.read(@path)
  end
end
43
+ end