github-linguist 2.1.2 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/linguist.rb +1 -0
- data/lib/linguist/blob_helper.rb +7 -241
- data/lib/linguist/generated.rb +161 -0
- data/lib/linguist/language.rb +37 -54
- data/lib/linguist/languages.yml +5 -41
- data/lib/linguist/samples.json +12055 -5573
- data/lib/linguist/samples.rb +7 -5
- data/lib/linguist/tokenizer.rb +47 -5
- metadata +3 -2
data/lib/linguist.rb
CHANGED
data/lib/linguist/blob_helper.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
|
-
require 'linguist/classifier'
|
1
|
+
require 'linguist/generated'
|
2
2
|
require 'linguist/language'
|
3
3
|
require 'linguist/mime'
|
4
|
-
require 'linguist/samples'
|
5
4
|
|
6
5
|
require 'charlock_holmes'
|
7
6
|
require 'escape_utils'
|
@@ -129,15 +128,6 @@ module Linguist
|
|
129
128
|
['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
|
130
129
|
end
|
131
130
|
|
132
|
-
# Public: Is the blob likely to have a shebang?
|
133
|
-
#
|
134
|
-
# Return true or false
|
135
|
-
def shebang_extname?
|
136
|
-
extname.empty? &&
|
137
|
-
mode &&
|
138
|
-
(mode.to_i(8) & 05) == 05
|
139
|
-
end
|
140
|
-
|
141
131
|
MEGABYTE = 1024 * 1024
|
142
132
|
|
143
133
|
# Public: Is the blob too big to load?
|
@@ -221,143 +211,16 @@ module Linguist
|
|
221
211
|
lines.grep(/\S/).size
|
222
212
|
end
|
223
213
|
|
224
|
-
# Internal: Compute average line length.
|
225
|
-
#
|
226
|
-
# Returns Integer.
|
227
|
-
def average_line_length
|
228
|
-
if lines.any?
|
229
|
-
lines.inject(0) { |n, l| n += l.length } / lines.length
|
230
|
-
else
|
231
|
-
0
|
232
|
-
end
|
233
|
-
end
|
234
|
-
|
235
214
|
# Public: Is the blob a generated file?
|
236
215
|
#
|
237
216
|
# Generated source code is suppressed in diffs and is ignored by
|
238
217
|
# language statistics.
|
239
218
|
#
|
240
|
-
#
|
241
|
-
#
|
242
|
-
# Includes:
|
243
|
-
# - XCode project XML files
|
244
|
-
# - Minified JavaScript
|
245
|
-
# - Compiled CoffeeScript
|
246
|
-
# - PEG.js-generated parsers
|
247
|
-
#
|
248
|
-
# Please add additional test coverage to
|
249
|
-
# `test/test_blob.rb#test_generated` if you make any changes.
|
219
|
+
# May load Blob#data
|
250
220
|
#
|
251
221
|
# Return true or false
|
252
222
|
def generated?
|
253
|
-
|
254
|
-
xcode_project_file? || generated_net_docfile? || generated_parser?
|
255
|
-
true
|
256
|
-
else
|
257
|
-
false
|
258
|
-
end
|
259
|
-
end
|
260
|
-
|
261
|
-
# Internal: Is the blob an XCode project file?
|
262
|
-
#
|
263
|
-
# Generated if the file extension is an XCode project
|
264
|
-
# file extension.
|
265
|
-
#
|
266
|
-
# Returns true or false.
|
267
|
-
def xcode_project_file?
|
268
|
-
['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
|
269
|
-
end
|
270
|
-
|
271
|
-
# Internal: Is the blob minified JS?
|
272
|
-
#
|
273
|
-
# Consider JS minified if the average line length is
|
274
|
-
# greater than 100 characters.
|
275
|
-
#
|
276
|
-
# Returns true or false.
|
277
|
-
def minified_javascript?
|
278
|
-
return unless extname == '.js'
|
279
|
-
average_line_length > 100
|
280
|
-
end
|
281
|
-
|
282
|
-
# Internal: Is the blob of JS a parser generated by PEG.js?
|
283
|
-
#
|
284
|
-
# Requires Blob#data
|
285
|
-
#
|
286
|
-
# PEG.js-generated parsers are not meant to be consumed by humans.
|
287
|
-
#
|
288
|
-
# Return true or false
|
289
|
-
def generated_parser?
|
290
|
-
return false unless extname == '.js'
|
291
|
-
|
292
|
-
# PEG.js-generated parsers include a comment near the top of the file
|
293
|
-
# that marks them as such.
|
294
|
-
if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
|
295
|
-
return true
|
296
|
-
end
|
297
|
-
|
298
|
-
false
|
299
|
-
end
|
300
|
-
|
301
|
-
# Internal: Is the blob of JS generated by CoffeeScript?
|
302
|
-
#
|
303
|
-
# Requires Blob#data
|
304
|
-
#
|
305
|
-
# CoffeeScript is meant to output JS that would be difficult to
|
306
|
-
# tell if it was generated or not. Look for a number of patterns
|
307
|
-
# output by the CS compiler.
|
308
|
-
#
|
309
|
-
# Return true or false
|
310
|
-
def compiled_coffeescript?
|
311
|
-
return false unless extname == '.js'
|
312
|
-
|
313
|
-
# CoffeeScript generated by > 1.2 include a comment on the first line
|
314
|
-
if lines[0] =~ /^\/\/ Generated by /
|
315
|
-
return true
|
316
|
-
end
|
317
|
-
|
318
|
-
if lines[0] == '(function() {' && # First line is module closure opening
|
319
|
-
lines[-2] == '}).call(this);' && # Second to last line closes module closure
|
320
|
-
lines[-1] == '' # Last line is blank
|
321
|
-
|
322
|
-
score = 0
|
323
|
-
|
324
|
-
lines.each do |line|
|
325
|
-
if line =~ /var /
|
326
|
-
# Underscored temp vars are likely to be Coffee
|
327
|
-
score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
|
328
|
-
|
329
|
-
# bind and extend functions are very Coffee specific
|
330
|
-
score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
|
331
|
-
end
|
332
|
-
end
|
333
|
-
|
334
|
-
# Require a score of 3. This is fairly arbitrary. Consider
|
335
|
-
# tweaking later.
|
336
|
-
score >= 3
|
337
|
-
else
|
338
|
-
false
|
339
|
-
end
|
340
|
-
end
|
341
|
-
|
342
|
-
# Internal: Is this a generated documentation file for a .NET assembly?
|
343
|
-
#
|
344
|
-
# Requires Blob#data
|
345
|
-
#
|
346
|
-
# .NET developers often check in the XML Intellisense file along with an
|
347
|
-
# assembly - however, these don't have a special extension, so we have to
|
348
|
-
# dig into the contents to determine if it's a docfile. Luckily, these files
|
349
|
-
# are extremely structured, so recognizing them is easy.
|
350
|
-
#
|
351
|
-
# Returns true or false
|
352
|
-
def generated_net_docfile?
|
353
|
-
return false unless extname.downcase == ".xml"
|
354
|
-
return false unless lines.count > 3
|
355
|
-
|
356
|
-
# .NET Docfiles always open with <doc> and their first tag is an
|
357
|
-
# <assembly> tag
|
358
|
-
return lines[1].include?("<doc>") &&
|
359
|
-
lines[2].include?("<assembly>") &&
|
360
|
-
lines[-2].include?("</doc>")
|
223
|
+
@_generated ||= Generated.generated?(name, lambda { data })
|
361
224
|
end
|
362
225
|
|
363
226
|
# Public: Should the blob be indexed for searching?
|
@@ -375,6 +238,8 @@ module Linguist
|
|
375
238
|
def indexable?
|
376
239
|
if binary?
|
377
240
|
false
|
241
|
+
elsif extname == '.txt'
|
242
|
+
true
|
378
243
|
elsif language.nil?
|
379
244
|
false
|
380
245
|
elsif !language.searchable?
|
@@ -396,30 +261,11 @@ module Linguist
|
|
396
261
|
def language
|
397
262
|
if defined? @language
|
398
263
|
@language
|
399
|
-
|
400
|
-
@language = guess_language
|
264
|
+
elsif !binary_mime_type?
|
265
|
+
@language = Language.detect(name.to_s, lambda { data }, mode)
|
401
266
|
end
|
402
267
|
end
|
403
268
|
|
404
|
-
# Internal: Guess language
|
405
|
-
#
|
406
|
-
# Please add additional test coverage to
|
407
|
-
# `test/test_blob.rb#test_language` if you make any changes.
|
408
|
-
#
|
409
|
-
# Returns a Language or nil
|
410
|
-
def guess_language
|
411
|
-
return if binary_mime_type?
|
412
|
-
|
413
|
-
# Disambiguate between multiple language extensions
|
414
|
-
disambiguate_extension_language ||
|
415
|
-
|
416
|
-
# See if there is a Language for the extension
|
417
|
-
Language.find_by_filename(name.to_s) ||
|
418
|
-
|
419
|
-
# Try to detect Language from shebang line
|
420
|
-
shebang_language
|
421
|
-
end
|
422
|
-
|
423
269
|
# Internal: Get the lexer of the blob.
|
424
270
|
#
|
425
271
|
# Returns a Lexer.
|
@@ -427,86 +273,6 @@ module Linguist
|
|
427
273
|
language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
|
428
274
|
end
|
429
275
|
|
430
|
-
# Internal: Disambiguates between multiple language extensions.
|
431
|
-
#
|
432
|
-
# Returns a Language or nil.
|
433
|
-
def disambiguate_extension_language
|
434
|
-
if Language.ambiguous?(extname)
|
435
|
-
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
|
436
|
-
if possible_languages.any?
|
437
|
-
if result = Classifier.classify(Samples::DATA, data, possible_languages).first
|
438
|
-
Language[result[0]]
|
439
|
-
end
|
440
|
-
end
|
441
|
-
end
|
442
|
-
end
|
443
|
-
|
444
|
-
# Internal: Extract the script name from the shebang line
|
445
|
-
#
|
446
|
-
# Requires Blob#data
|
447
|
-
#
|
448
|
-
# Examples
|
449
|
-
#
|
450
|
-
# '#!/usr/bin/ruby'
|
451
|
-
# # => 'ruby'
|
452
|
-
#
|
453
|
-
# '#!/usr/bin/env ruby'
|
454
|
-
# # => 'ruby'
|
455
|
-
#
|
456
|
-
# '#!/usr/bash/python2.4'
|
457
|
-
# # => 'python'
|
458
|
-
#
|
459
|
-
# Please add additional test coverage to
|
460
|
-
# `test/test_blob.rb#test_shebang_script` if you make any changes.
|
461
|
-
#
|
462
|
-
# Returns a script name String or nil
|
463
|
-
def shebang_script
|
464
|
-
# Fail fast if blob isn't viewable?
|
465
|
-
return unless viewable?
|
466
|
-
|
467
|
-
if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
|
468
|
-
bang.sub!(/^#! /, '#!')
|
469
|
-
tokens = bang.split(' ')
|
470
|
-
pieces = tokens.first.split('/')
|
471
|
-
if pieces.size > 1
|
472
|
-
script = pieces.last
|
473
|
-
else
|
474
|
-
script = pieces.first.sub('#!', '')
|
475
|
-
end
|
476
|
-
|
477
|
-
script = script == 'env' ? tokens[1] : script
|
478
|
-
|
479
|
-
# python2.4 => python
|
480
|
-
if script =~ /((?:\d+\.?)+)/
|
481
|
-
script.sub! $1, ''
|
482
|
-
end
|
483
|
-
|
484
|
-
# Check for multiline shebang hacks that exec themselves
|
485
|
-
#
|
486
|
-
# #!/bin/sh
|
487
|
-
# exec foo "$0" "$@"
|
488
|
-
#
|
489
|
-
if script == 'sh' &&
|
490
|
-
lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
|
491
|
-
script = $1
|
492
|
-
end
|
493
|
-
|
494
|
-
script
|
495
|
-
end
|
496
|
-
end
|
497
|
-
|
498
|
-
# Internal: Get Language for shebang script
|
499
|
-
#
|
500
|
-
# Returns the Language or nil
|
501
|
-
def shebang_language
|
502
|
-
# Skip file extensions unlikely to have shebangs
|
503
|
-
return unless shebang_extname?
|
504
|
-
|
505
|
-
if script = shebang_script
|
506
|
-
Language[script]
|
507
|
-
end
|
508
|
-
end
|
509
|
-
|
510
276
|
# Public: Highlight syntax of blob
|
511
277
|
#
|
512
278
|
# options - A Hash of options (defaults to {})
|
@@ -0,0 +1,161 @@
|
|
1
|
+
module Linguist
|
2
|
+
class Generated
|
3
|
+
# Public: Is the blob a generated file?
|
4
|
+
#
|
5
|
+
# name - String filename
|
6
|
+
# data - String blob data. A block may also be passed in for lazy
|
7
|
+
# loading. This behavior is deprecated and you should always
|
8
|
+
# pass in a String.
|
9
|
+
#
|
10
|
+
# Return true or false
|
11
|
+
def self.generated?(name, data)
|
12
|
+
new(name, data).generated?
|
13
|
+
end
|
14
|
+
|
15
|
+
# Internal: Initialize Generated instance
|
16
|
+
#
|
17
|
+
# name - String filename
|
18
|
+
# data - String blob data
|
19
|
+
def initialize(name, data)
|
20
|
+
@name = name
|
21
|
+
@extname = File.extname(name)
|
22
|
+
@_data = data
|
23
|
+
end
|
24
|
+
|
25
|
+
attr_reader :name, :extname
|
26
|
+
|
27
|
+
# Lazy load blob data if block was passed in.
|
28
|
+
#
|
29
|
+
# Awful, awful stuff happening here.
|
30
|
+
#
|
31
|
+
# Returns String data.
|
32
|
+
def data
|
33
|
+
@data ||= @_data.respond_to?(:call) ? @_data.call() : @_data
|
34
|
+
end
|
35
|
+
|
36
|
+
# Public: Get each line of data
|
37
|
+
#
|
38
|
+
# Returns an Array of lines
|
39
|
+
def lines
|
40
|
+
@lines ||= data.split("\n", -1)
|
41
|
+
end
|
42
|
+
|
43
|
+
# Internal: Is the blob a generated file?
|
44
|
+
#
|
45
|
+
# Generated source code is suppressed in diffs and is ignored by
|
46
|
+
# language statistics.
|
47
|
+
#
|
48
|
+
# Please add additional test coverage to
|
49
|
+
# `test/test_blob.rb#test_generated` if you make any changes.
|
50
|
+
#
|
51
|
+
# Return true or false
|
52
|
+
def generated?
|
53
|
+
name == 'Gemfile.lock' ||
|
54
|
+
minified_javascript? ||
|
55
|
+
compiled_coffeescript? ||
|
56
|
+
xcode_project_file? ||
|
57
|
+
generated_net_docfile? ||
|
58
|
+
generated_parser?
|
59
|
+
end
|
60
|
+
|
61
|
+
# Internal: Is the blob an XCode project file?
|
62
|
+
#
|
63
|
+
# Generated if the file extension is an XCode project
|
64
|
+
# file extension.
|
65
|
+
#
|
66
|
+
# Returns true or false.
|
67
|
+
def xcode_project_file?
|
68
|
+
['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
|
69
|
+
end
|
70
|
+
|
71
|
+
# Internal: Is the blob minified JS?
|
72
|
+
#
|
73
|
+
# Consider JS minified if the average line length is
|
74
|
+
# greater than 100 characters.
|
75
|
+
#
|
76
|
+
# Returns true or false.
|
77
|
+
def minified_javascript?
|
78
|
+
return unless extname == '.js'
|
79
|
+
if lines.any?
|
80
|
+
(lines.inject(0) { |n, l| n += l.length } / lines.length) > 100
|
81
|
+
else
|
82
|
+
false
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
# Internal: Is the blob of JS generated by CoffeeScript?
|
87
|
+
#
|
88
|
+
# CoffeeScript is meant to output JS that would be difficult to
|
89
|
+
# tell if it was generated or not. Look for a number of patterns
|
90
|
+
# output by the CS compiler.
|
91
|
+
#
|
92
|
+
# Return true or false
|
93
|
+
def compiled_coffeescript?
|
94
|
+
return false unless extname == '.js'
|
95
|
+
|
96
|
+
# CoffeeScript generated by > 1.2 include a comment on the first line
|
97
|
+
if lines[0] =~ /^\/\/ Generated by /
|
98
|
+
return true
|
99
|
+
end
|
100
|
+
|
101
|
+
if lines[0] == '(function() {' && # First line is module closure opening
|
102
|
+
lines[-2] == '}).call(this);' && # Second to last line closes module closure
|
103
|
+
lines[-1] == '' # Last line is blank
|
104
|
+
|
105
|
+
score = 0
|
106
|
+
|
107
|
+
lines.each do |line|
|
108
|
+
if line =~ /var /
|
109
|
+
# Underscored temp vars are likely to be Coffee
|
110
|
+
score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
|
111
|
+
|
112
|
+
# bind and extend functions are very Coffee specific
|
113
|
+
score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
# Require a score of 3. This is fairly arbitrary. Consider
|
118
|
+
# tweaking later.
|
119
|
+
score >= 3
|
120
|
+
else
|
121
|
+
false
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
# Internal: Is this a generated documentation file for a .NET assembly?
|
126
|
+
#
|
127
|
+
# .NET developers often check in the XML Intellisense file along with an
|
128
|
+
# assembly - however, these don't have a special extension, so we have to
|
129
|
+
# dig into the contents to determine if it's a docfile. Luckily, these files
|
130
|
+
# are extremely structured, so recognizing them is easy.
|
131
|
+
#
|
132
|
+
# Returns true or false
|
133
|
+
def generated_net_docfile?
|
134
|
+
return false unless extname.downcase == ".xml"
|
135
|
+
return false unless lines.count > 3
|
136
|
+
|
137
|
+
# .NET Docfiles always open with <doc> and their first tag is an
|
138
|
+
# <assembly> tag
|
139
|
+
return lines[1].include?("<doc>") &&
|
140
|
+
lines[2].include?("<assembly>") &&
|
141
|
+
lines[-2].include?("</doc>")
|
142
|
+
end
|
143
|
+
|
144
|
+
# Internal: Is the blob of JS a parser generated by PEG.js?
|
145
|
+
#
|
146
|
+
# PEG.js-generated parsers are not meant to be consumed by humans.
|
147
|
+
#
|
148
|
+
# Return true or false
|
149
|
+
def generated_parser?
|
150
|
+
return false unless extname == '.js'
|
151
|
+
|
152
|
+
# PEG.js-generated parsers include a comment near the top of the file
|
153
|
+
# that marks them as such.
|
154
|
+
if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
|
155
|
+
return true
|
156
|
+
end
|
157
|
+
|
158
|
+
false
|
159
|
+
end
|
160
|
+
end
|
161
|
+
end
|