language_sniffer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
#!/usr/bin/env ruby

# Command-line entry point: prints the line counts, extension and detected
# language of the single file named by the first argument.
require 'language_sniffer/file_blob'

target = ARGV[0] || ''

# Directories are rejected explicitly; anything else that is not a regular
# file (including a missing argument) gets the usage message.
abort "Cannot parse a whole directory" if File.directory?(target)
abort "usage: language_sniffer <file>" unless File.file?(target)

blob = LanguageSniffer::FileBlob.new(target, Dir.pwd)

puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
puts " extension: #{blob.pathname.extname}"
puts " language: #{blob.language}"
puts " appears to be generated source code" if blob.generated?
@@ -0,0 +1,389 @@
1
+ require 'language_sniffer/language'
2
+ require 'language_sniffer/pathname'
3
+ require 'yaml'
4
+
5
+ module LanguageSniffer
6
+ # BlobHelper is a mixin for Blobish classes that respond to "name",
7
+ # "data" and "size" such as Grit::Blob.
8
+ module BlobHelper
9
# Internal: Wrap Blob#name in a Pathname.
#
# A missing name is treated as the empty string.
#
# Returns a Pathname.
def pathname
  blob_name = name || ""
  Pathname.new(blob_name)
end
15
+
16
# Public: Get the file extension of the blob's path.
#
# Examples
#
#   blob(name='foo.rb').extname
#   # => '.rb'
#
# Returns a String.
def extname
  wrapped_path = pathname
  wrapped_path.extname
end
27
+
28
# Public: Split the blob's data into lines.
#
# Requires Blob#data. The split keeps trailing empty strings, so data that
# ends in a newline yields a final "" element. The result is memoized.
#
# Returns an Array of line Strings (empty when there is no data).
def lines
  return @lines if @lines

  contents = data
  @lines = contents ? contents.split("\n", -1) : []
end
36
+
37
# Public: Count every line in the blob, blank ones included.
#
# Requires Blob#data
#
# Returns Integer
def loc
  lines.length
end
45
+
46
# Public: Count source lines, i.e. lines containing at least one
# non-whitespace character.
#
# Requires Blob#data
#
# Returns Integer
def sloc
  lines.count { |line| line =~ /\S/ }
end
54
+
55
# Internal: Compute the average line length using integer division.
#
# Returns Integer (0 when there are no lines).
def average_line_length
  return 0 unless lines.any?

  total_chars = 0
  lines.each { |line| total_chars += line.length }
  total_chars / lines.length
end
65
+
66
# Public: Is the blob a generated file?
#
# Requires Blob#data
#
# A blob counts as generated when any of these checks succeeds:
# - XCode project file extension
# - Visual Studio project file extension
# - JavaScript compiled from CoffeeScript
# - Minified JavaScript
# - .NET XML documentation file
#
# Please add additional test coverage to
# `test/test_blob.rb#test_generated` if you make any changes.
#
# Returns true or false.
def generated?
  # The extension-only checks run first, then the ones that inspect content.
  return true if xcode_project_file? || visual_studio_project_file?
  return true if generated_coffeescript? || minified_javascript? || generated_net_docfile?

  false
end
91
+
92
# Internal: Is the blob an XCode project file?
#
# Decided purely by the file extension.
#
# Returns true or false.
def xcode_project_file?
  xcode_extensions = %w[.xib .nib .pbxproj .xcworkspacedata .xcuserstate]
  xcode_extensions.include?(extname)
end
101
+
102
# Internal: Is the blob a Visual Studio project file?
#
# Decided purely by the file extension.
#
# Returns true or false.
def visual_studio_project_file?
  vs_extensions = %w[
    .csproj .dbproj .fsproj .pyproj .rbproj .vbproj
    .vcxproj .wixproj .resx .sln .vdproj .isproj
  ]
  vs_extensions.include?(extname)
end
111
+
112
# Internal: Is the blob minified JS?
#
# Consider JS minified if the average line length is
# greater than 100 characters.
#
# Returns true or false (false, not nil, for non-.js files).
def minified_javascript?
  return false unless extname == '.js'

  average_line_length > 100
end
122
+
123
# Internal: Is the blob JS generated by CoffeeScript?
#
# Requires Blob#data
#
# CoffeeScript is meant to output JS that would be difficult to
# tell if it was generated or not. Look for a number of patterns
# output by the CS compiler.
#
# Returns true or false (false, not nil, for non-.js files).
def generated_coffeescript?
  return false unless extname == '.js'

  # The file must open with the module closure, close it on the
  # second-to-last line, and end with a blank last line.
  closure_wrapped = lines[0] == '(function() {' &&
                    lines[-2] == '}).call(this);' &&
                    lines[-1] == ''
  return false unless closure_wrapped

  score = 0
  lines.each do |line|
    next unless line =~ /var /

    # Underscored temp vars are likely to be Coffee
    score += line.scan(/(_fn|_i|_len|_ref|_results)/).size

    # bind and extend functions are very Coffee specific
    score += 3 * line.scan(/(__bind|__extends|__hasProp|__indexOf|__slice)/).size
  end

  # Require a score of 3. This is fairly arbitrary. Consider
  # tweaking later.
  score >= 3
end
158
+
159
# Internal: Is this a generated documentation file for a .NET assembly?
#
# Requires Blob#data
#
# These XML files have no special extension, so the contents are inspected:
# the second line must contain <doc>, the third <assembly>, and the
# second-to-last line </doc>. Files with three lines or fewer never match.
#
# Returns true or false
def generated_net_docfile?
  return false if extname.downcase != ".xml" || lines.count <= 3

  doc_open, assembly_tag, doc_close = lines[1], lines[2], lines[-2]
  doc_open.include?("<doc>") &&
    assembly_tag.include?("<assembly>") &&
    doc_close.include?("</doc>")
end
179
+
180
# Public: Detects the Language of the blob.
#
# May load Blob#data. The guess is computed once and cached, including
# when the result is nil.
#
# Returns a Language or nil if none is detected
def language
  return @language if defined?(@language)

  @language = guess_language
end
192
+
193
# Internal: Guess language
#
# Tries each strategy in order and stops at the first that yields a result:
# 1. Disambiguation between languages sharing the extension
# 2. The Language registered for the extension
# 3. Idioms in the first line
# 4. The shebang line
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you make any changes.
#
# Returns a Language or nil
def guess_language
  detected = disambiguate_extension_language
  detected ||= pathname.language
  detected ||= first_line_language
  detected || shebang_language
end
212
+
213
# Internal: Disambiguates between multiple language extensions.
#
# Delegates to "guess_EXTENSION_language" when the extension is ambiguous
# and such a method is available.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you add another method.
#
# Returns a Language or nil.
def disambiguate_extension_language
  return unless Language.ambiguous?(extname)

  guesser = "guess_#{extname.sub(/^\./, '')}_language"
  send(guesser) if respond_to?(guesser)
end
227
+
228
# Internal: Guess language of header files (.h).
#
# Objective-C directives at the start of a line win; otherwise a leading
# "class " or an indented access specifier means C++; otherwise C.
#
# Returns a Language.
def guess_h_language
  objc_directive = /^@(interface|property|private|public|end)/
  cpp_construct = /^class |^\s+(public|protected|private):/

  if lines.any? { |line| line =~ objc_directive }
    Language['Objective-C']
  elsif lines.any? { |line| line =~ cpp_construct }
    Language['C++']
  else
    Language['C']
  end
end
240
+
241
# Internal: Guess language of .m files.
#
# Objective-C heuristics:
# * Keywords
#
# Matlab heuristics:
# * Leading function keyword
# * "%" comments
#
# Returns a Language.
def guess_m_language
  objc_keyword = /^#import|@(interface|implementation|property|synthesize|end)/
  return Language['Objective-C'] if lines.any? { |line| line =~ objc_keyword }

  # Either a file-level function declaration or a "%" comment marks Matlab.
  matlab = lines.first.to_s =~ /^function / || lines.any? { |line| line =~ /^%/ }

  # Fall back to Objective-C, don't want any Matlab false positives
  Language[matlab ? 'Matlab' : 'Objective-C']
end
269
+
270
# Internal: Guess language of .pl files
#
# The rules for disambiguation are:
#
# 1. A perl shebang settles it as Perl
# 2. Otherwise any line containing the ":-" operator means Prolog
# 3. Default to Perl
#
# Returns a Language.
def guess_pl_language
  return Language['Perl'] if shebang_script == 'perl'

  has_rule = lines.grep(/:-/).any?
  has_rule ? Language['Prolog'] : Language['Perl']
end
288
+
289
# Internal: Guess language of .r files.
#
# Any line matching the Rebol pattern (case-insensitive) selects Rebol;
# everything else is R.
#
# Returns a Language.
def guess_r_language
  rebol_hint = /(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i
  looks_like_rebol = lines.any? { |line| line =~ rebol_hint }
  Language[looks_like_rebol ? 'Rebol' : 'R']
end
299
+
300
# Internal: Guess language of .gsp files.
#
# Any line containing one of the template markers in the pattern selects
# Groovy Server Pages; otherwise Gosu.
#
# Returns a Language.
def guess_gsp_language
  template_marker = /<%|<%@|\$\{|<%|<g:|<meta name="layout"|<r:/
  server_page = lines.any? { |line| line =~ template_marker }
  Language[server_page ? 'Groovy Server Pages' : 'Gosu']
end
310
+
311
# Internal: Guess language from the first line.
#
# Look for leading "<?php"
#
# Returns a Language, or nil when the first line does not match.
def first_line_language
  opening_line = lines.first.to_s
  Language['PHP'] if opening_line =~ /^<\?php/
end
321
+
322
# Internal: Extract the script name from the shebang line
#
# Requires Blob#data
#
# Examples
#
#   '#!/usr/bin/ruby'
#   # => 'ruby'
#
#   '#!/usr/bin/env ruby'
#   # => 'ruby'
#
#   '#!/usr/bash/python2.4'
#   # => 'python'
#
# Please add additional test coverage to
# `test/test_blob.rb#test_shebang_script` if you make any changes.
#
# Returns a script name String or nil
def shebang_script
  return unless lines.any?

  first_line = lines[0].match(/(.+)\n?/)
  return unless first_line

  bang = first_line[0]
  return unless bang =~ /^#!/

  # Collapse "#! /path" into "#!/path" so the first token carries the path.
  bang.sub!(/^#! /, '#!')
  tokens = bang.split(' ')
  pieces = tokens.first.split('/')

  script = pieces.size > 1 ? pieces.last : pieces.first.sub('#!', '')

  # "#!/usr/bin/env foo" names the interpreter in the second token.
  script = tokens[1] if script == 'env'

  # python2.4 => python
  version = Regexp.last_match(1) if script =~ /((?:\d+\.?)+)/
  script = script.sub(version, '') if version

  # Check for multiline shebang hacks that exec themselves
  #
  #   #!/bin/sh
  #   exec foo "$0" "$@"
  #
  if script == 'sh'
    exec_match = nil
    lines[0...5].find { |line| exec_match = line.match(/exec (\w+).+\$0.+\$@/) }
    script = exec_match[1] if exec_match
  end

  script
end
372
+
373
# Internal: Get Language for shebang script
#
# Returns the Language or nil
def shebang_language
  script = shebang_script
  return unless script

  Language[script]
end
381
+
382
# Load-time sanity check: every extension reported by
# Language.overridden_extensions should have a matching
# guess_EXTENSION_language method defined above; warn about any that do not.
Language.overridden_extensions.each do |extension|
  guesser = "guess_#{extension.sub(/^\./, '')}_language".to_sym
  next if instance_methods.map(&:to_sym).include?(guesser)

  warn "Language##{guesser} was not defined"
end
388
+ end
389
+ end
@@ -0,0 +1,43 @@
1
+ require 'language_sniffer/blob_helper'
2
+
3
module LanguageSniffer
  # A FileBlob is a wrapper around a File object to make it quack
  # like a Grit::Blob. It provides the basic interface: `name`
  # and `data`; the remaining behaviour comes from BlobHelper.
  class FileBlob
    include BlobHelper

    # Public: Initialize a new FileBlob from a path
    #
    # path      - A path String that exists on the file system.
    # base_path - Optional base to relativize the path. It is removed from
    #             the name only when path starts with "base_path/".
    # data      - Optional String of file contents. When nil, the contents
    #             are read from path on first access.
    #
    # Returns a FileBlob.
    def initialize(path, base_path = nil, data = nil)
      @path = path
      @name = relative_name(path, base_path)
      @data = data
    end

    # Public: Filename
    #
    # Examples
    #
    #   FileBlob.new("/path/to/language_sniffer/lib/language_sniffer.rb").name
    #   # => "/path/to/language_sniffer/lib/language_sniffer.rb"
    #
    #   FileBlob.new("/path/to/language_sniffer/lib/language_sniffer.rb",
    #                "/path/to/language_sniffer").name
    #   # => "lib/language_sniffer.rb"
    #
    # Returns a String
    attr_reader :name

    # Public: Read file contents.
    #
    # The contents are cached after the first read.
    #
    # Returns a String.
    def data
      @data ||= File.read(@path)
    end

    private

    # Internal: Strip a leading "base_path/" from path.
    #
    # Only a prefix is removed; an occurrence of the base in the middle of
    # the path is left untouched.
    #
    # Returns a String.
    def relative_name(path, base_path)
      return path unless base_path

      prefix = "#{base_path}/"
      path.start_with?(prefix) ? path[prefix.length..-1] : path
    end
  end
end