github-linguist 2.1.2 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/linguist.rb +1 -0
- data/lib/linguist/blob_helper.rb +7 -241
- data/lib/linguist/generated.rb +161 -0
- data/lib/linguist/language.rb +37 -54
- data/lib/linguist/languages.yml +5 -41
- data/lib/linguist/samples.json +12055 -5573
- data/lib/linguist/samples.rb +7 -5
- data/lib/linguist/tokenizer.rb +47 -5
- metadata +3 -2
data/lib/linguist.rb
CHANGED
data/lib/linguist/blob_helper.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
|
-
require 'linguist/classifier'
|
1
|
+
require 'linguist/generated'
|
2
2
|
require 'linguist/language'
|
3
3
|
require 'linguist/mime'
|
4
|
-
require 'linguist/samples'
|
5
4
|
|
6
5
|
require 'charlock_holmes'
|
7
6
|
require 'escape_utils'
|
@@ -129,15 +128,6 @@ module Linguist
|
|
129
128
|
['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
|
130
129
|
end
|
131
130
|
|
132
|
-
# Public: Is the blob likely to have a shebang?
|
133
|
-
#
|
134
|
-
# Return true or false
|
135
|
-
def shebang_extname?
|
136
|
-
extname.empty? &&
|
137
|
-
mode &&
|
138
|
-
(mode.to_i(8) & 05) == 05
|
139
|
-
end
|
140
|
-
|
141
131
|
MEGABYTE = 1024 * 1024
|
142
132
|
|
143
133
|
# Public: Is the blob too big to load?
|
@@ -221,143 +211,16 @@ module Linguist
|
|
221
211
|
lines.grep(/\S/).size
|
222
212
|
end
|
223
213
|
|
224
|
-
# Internal: Compute average line length.
|
225
|
-
#
|
226
|
-
# Returns Integer.
|
227
|
-
def average_line_length
|
228
|
-
if lines.any?
|
229
|
-
lines.inject(0) { |n, l| n += l.length } / lines.length
|
230
|
-
else
|
231
|
-
0
|
232
|
-
end
|
233
|
-
end
|
234
|
-
|
235
214
|
# Public: Is the blob a generated file?
|
236
215
|
#
|
237
216
|
# Generated source code is suppressed in diffs and is ignored by
|
238
217
|
# language statistics.
|
239
218
|
#
|
240
|
-
#
|
241
|
-
#
|
242
|
-
# Includes:
|
243
|
-
# - XCode project XML files
|
244
|
-
# - Minified JavaScript
|
245
|
-
# - Compiled CoffeeScript
|
246
|
-
# - PEG.js-generated parsers
|
247
|
-
#
|
248
|
-
# Please add additional test coverage to
|
249
|
-
# `test/test_blob.rb#test_generated` if you make any changes.
|
219
|
+
# May load Blob#data
|
250
220
|
#
|
251
221
|
# Return true or false
|
252
222
|
def generated?
|
253
|
-
|
254
|
-
xcode_project_file? || generated_net_docfile? || generated_parser?
|
255
|
-
true
|
256
|
-
else
|
257
|
-
false
|
258
|
-
end
|
259
|
-
end
|
260
|
-
|
261
|
-
# Internal: Is the blob an XCode project file?
|
262
|
-
#
|
263
|
-
# Generated if the file extension is an XCode project
|
264
|
-
# file extension.
|
265
|
-
#
|
266
|
-
# Returns true or false.
|
267
|
-
def xcode_project_file?
|
268
|
-
['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
|
269
|
-
end
|
270
|
-
|
271
|
-
# Internal: Is the blob minified JS?
|
272
|
-
#
|
273
|
-
# Consider JS minified if the average line length is
|
274
|
-
# greater than 100 characters.
|
275
|
-
#
|
276
|
-
# Returns true or false.
|
277
|
-
def minified_javascript?
|
278
|
-
return unless extname == '.js'
|
279
|
-
average_line_length > 100
|
280
|
-
end
|
281
|
-
|
282
|
-
# Internal: Is the blob of JS a parser generated by PEG.js?
|
283
|
-
#
|
284
|
-
# Requires Blob#data
|
285
|
-
#
|
286
|
-
# PEG.js-generated parsers are not meant to be consumed by humans.
|
287
|
-
#
|
288
|
-
# Return true or false
|
289
|
-
def generated_parser?
|
290
|
-
return false unless extname == '.js'
|
291
|
-
|
292
|
-
# PEG.js-generated parsers include a comment near the top of the file
|
293
|
-
# that marks them as such.
|
294
|
-
if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
|
295
|
-
return true
|
296
|
-
end
|
297
|
-
|
298
|
-
false
|
299
|
-
end
|
300
|
-
|
301
|
-
# Internal: Is the blob of JS generated by CoffeeScript?
|
302
|
-
#
|
303
|
-
# Requires Blob#data
|
304
|
-
#
|
305
|
-
# CoffeeScript is meant to output JS that would be difficult to
|
306
|
-
# tell if it was generated or not. Look for a number of patterns
|
307
|
-
# output by the CS compiler.
|
308
|
-
#
|
309
|
-
# Return true or false
|
310
|
-
def compiled_coffeescript?
|
311
|
-
return false unless extname == '.js'
|
312
|
-
|
313
|
-
# CoffeeScript generated by > 1.2 include a comment on the first line
|
314
|
-
if lines[0] =~ /^\/\/ Generated by /
|
315
|
-
return true
|
316
|
-
end
|
317
|
-
|
318
|
-
if lines[0] == '(function() {' && # First line is module closure opening
|
319
|
-
lines[-2] == '}).call(this);' && # Second to last line closes module closure
|
320
|
-
lines[-1] == '' # Last line is blank
|
321
|
-
|
322
|
-
score = 0
|
323
|
-
|
324
|
-
lines.each do |line|
|
325
|
-
if line =~ /var /
|
326
|
-
# Underscored temp vars are likely to be Coffee
|
327
|
-
score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
|
328
|
-
|
329
|
-
# bind and extend functions are very Coffee specific
|
330
|
-
score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
|
331
|
-
end
|
332
|
-
end
|
333
|
-
|
334
|
-
# Require a score of 3. This is fairly arbitrary. Consider
|
335
|
-
# tweaking later.
|
336
|
-
score >= 3
|
337
|
-
else
|
338
|
-
false
|
339
|
-
end
|
340
|
-
end
|
341
|
-
|
342
|
-
# Internal: Is this a generated documentation file for a .NET assembly?
|
343
|
-
#
|
344
|
-
# Requires Blob#data
|
345
|
-
#
|
346
|
-
# .NET developers often check in the XML Intellisense file along with an
|
347
|
-
# assembly - however, these don't have a special extension, so we have to
|
348
|
-
# dig into the contents to determine if it's a docfile. Luckily, these files
|
349
|
-
# are extremely structured, so recognizing them is easy.
|
350
|
-
#
|
351
|
-
# Returns true or false
|
352
|
-
def generated_net_docfile?
|
353
|
-
return false unless extname.downcase == ".xml"
|
354
|
-
return false unless lines.count > 3
|
355
|
-
|
356
|
-
# .NET Docfiles always open with <doc> and their first tag is an
|
357
|
-
# <assembly> tag
|
358
|
-
return lines[1].include?("<doc>") &&
|
359
|
-
lines[2].include?("<assembly>") &&
|
360
|
-
lines[-2].include?("</doc>")
|
223
|
+
@_generated ||= Generated.generated?(name, lambda { data })
|
361
224
|
end
|
362
225
|
|
363
226
|
# Public: Should the blob be indexed for searching?
|
@@ -375,6 +238,8 @@ module Linguist
|
|
375
238
|
def indexable?
|
376
239
|
if binary?
|
377
240
|
false
|
241
|
+
elsif extname == '.txt'
|
242
|
+
true
|
378
243
|
elsif language.nil?
|
379
244
|
false
|
380
245
|
elsif !language.searchable?
|
@@ -396,30 +261,11 @@ module Linguist
|
|
396
261
|
def language
|
397
262
|
if defined? @language
|
398
263
|
@language
|
399
|
-
|
400
|
-
@language =
|
264
|
+
elsif !binary_mime_type?
|
265
|
+
@language = Language.detect(name.to_s, lambda { data }, mode)
|
401
266
|
end
|
402
267
|
end
|
403
268
|
|
404
|
-
# Internal: Guess language
|
405
|
-
#
|
406
|
-
# Please add additional test coverage to
|
407
|
-
# `test/test_blob.rb#test_language` if you make any changes.
|
408
|
-
#
|
409
|
-
# Returns a Language or nil
|
410
|
-
def guess_language
|
411
|
-
return if binary_mime_type?
|
412
|
-
|
413
|
-
# Disambiguate between multiple language extensions
|
414
|
-
disambiguate_extension_language ||
|
415
|
-
|
416
|
-
# See if there is a Language for the extension
|
417
|
-
Language.find_by_filename(name.to_s) ||
|
418
|
-
|
419
|
-
# Try to detect Language from shebang line
|
420
|
-
shebang_language
|
421
|
-
end
|
422
|
-
|
423
269
|
# Internal: Get the lexer of the blob.
|
424
270
|
#
|
425
271
|
# Returns a Lexer.
|
@@ -427,86 +273,6 @@ module Linguist
|
|
427
273
|
language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
|
428
274
|
end
|
429
275
|
|
430
|
-
# Internal: Disambiguates between multiple language extensions.
|
431
|
-
#
|
432
|
-
# Returns a Language or nil.
|
433
|
-
def disambiguate_extension_language
|
434
|
-
if Language.ambiguous?(extname)
|
435
|
-
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
|
436
|
-
if possible_languages.any?
|
437
|
-
if result = Classifier.classify(Samples::DATA, data, possible_languages).first
|
438
|
-
Language[result[0]]
|
439
|
-
end
|
440
|
-
end
|
441
|
-
end
|
442
|
-
end
|
443
|
-
|
444
|
-
# Internal: Extract the script name from the shebang line
|
445
|
-
#
|
446
|
-
# Requires Blob#data
|
447
|
-
#
|
448
|
-
# Examples
|
449
|
-
#
|
450
|
-
# '#!/usr/bin/ruby'
|
451
|
-
# # => 'ruby'
|
452
|
-
#
|
453
|
-
# '#!/usr/bin/env ruby'
|
454
|
-
# # => 'ruby'
|
455
|
-
#
|
456
|
-
# '#!/usr/bash/python2.4'
|
457
|
-
# # => 'python'
|
458
|
-
#
|
459
|
-
# Please add additional test coverage to
|
460
|
-
# `test/test_blob.rb#test_shebang_script` if you make any changes.
|
461
|
-
#
|
462
|
-
# Returns a script name String or nil
|
463
|
-
def shebang_script
|
464
|
-
# Fail fast if blob isn't viewable?
|
465
|
-
return unless viewable?
|
466
|
-
|
467
|
-
if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
|
468
|
-
bang.sub!(/^#! /, '#!')
|
469
|
-
tokens = bang.split(' ')
|
470
|
-
pieces = tokens.first.split('/')
|
471
|
-
if pieces.size > 1
|
472
|
-
script = pieces.last
|
473
|
-
else
|
474
|
-
script = pieces.first.sub('#!', '')
|
475
|
-
end
|
476
|
-
|
477
|
-
script = script == 'env' ? tokens[1] : script
|
478
|
-
|
479
|
-
# python2.4 => python
|
480
|
-
if script =~ /((?:\d+\.?)+)/
|
481
|
-
script.sub! $1, ''
|
482
|
-
end
|
483
|
-
|
484
|
-
# Check for multiline shebang hacks that exec themselves
|
485
|
-
#
|
486
|
-
# #!/bin/sh
|
487
|
-
# exec foo "$0" "$@"
|
488
|
-
#
|
489
|
-
if script == 'sh' &&
|
490
|
-
lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
|
491
|
-
script = $1
|
492
|
-
end
|
493
|
-
|
494
|
-
script
|
495
|
-
end
|
496
|
-
end
|
497
|
-
|
498
|
-
# Internal: Get Language for shebang script
|
499
|
-
#
|
500
|
-
# Returns the Language or nil
|
501
|
-
def shebang_language
|
502
|
-
# Skip file extensions unlikely to have shebangs
|
503
|
-
return unless shebang_extname?
|
504
|
-
|
505
|
-
if script = shebang_script
|
506
|
-
Language[script]
|
507
|
-
end
|
508
|
-
end
|
509
|
-
|
510
276
|
# Public: Highlight syntax of blob
|
511
277
|
#
|
512
278
|
# options - A Hash of options (defaults to {})
|
@@ -0,0 +1,161 @@
|
|
1
|
+
module Linguist
|
2
|
+
class Generated
|
3
|
+
# Public: Is the blob a generated file?
|
4
|
+
#
|
5
|
+
# name - String filename
|
6
|
+
# data - String blob data. A block may also be passed in for lazy
|
7
|
+
# loading. This behavior is deprecated and you should always
|
8
|
+
# pass in a String.
|
9
|
+
#
|
10
|
+
# Return true or false
|
11
|
+
def self.generated?(name, data)
|
12
|
+
new(name, data).generated?
|
13
|
+
end
|
14
|
+
|
15
|
+
# Internal: Initialize Generated instance
|
16
|
+
#
|
17
|
+
# name - String filename
|
18
|
+
# data - String blob data
|
19
|
+
def initialize(name, data)
|
20
|
+
@name = name
|
21
|
+
@extname = File.extname(name)
|
22
|
+
@_data = data
|
23
|
+
end
|
24
|
+
|
25
|
+
attr_reader :name, :extname
|
26
|
+
|
27
|
+
# Lazy load blob data if block was passed in.
|
28
|
+
#
|
29
|
+
# Awful, awful stuff happening here.
|
30
|
+
#
|
31
|
+
# Returns String data.
|
32
|
+
def data
|
33
|
+
@data ||= @_data.respond_to?(:call) ? @_data.call() : @_data
|
34
|
+
end
|
35
|
+
|
36
|
+
# Public: Get each line of data
|
37
|
+
#
|
38
|
+
# Returns an Array of lines
|
39
|
+
def lines
|
40
|
+
@lines ||= data.split("\n", -1)
|
41
|
+
end
|
42
|
+
|
43
|
+
# Internal: Is the blob a generated file?
|
44
|
+
#
|
45
|
+
# Generated source code is suppressed in diffs and is ignored by
|
46
|
+
# language statistics.
|
47
|
+
#
|
48
|
+
# Please add additional test coverage to
|
49
|
+
# `test/test_blob.rb#test_generated` if you make any changes.
|
50
|
+
#
|
51
|
+
# Return true or false
|
52
|
+
def generated?
|
53
|
+
name == 'Gemfile.lock' ||
|
54
|
+
minified_javascript? ||
|
55
|
+
compiled_coffeescript? ||
|
56
|
+
xcode_project_file? ||
|
57
|
+
generated_net_docfile? ||
|
58
|
+
generated_parser?
|
59
|
+
end
|
60
|
+
|
61
|
+
# Internal: Is the blob an XCode project file?
|
62
|
+
#
|
63
|
+
# Generated if the file extension is an XCode project
|
64
|
+
# file extension.
|
65
|
+
#
|
66
|
+
# Returns true or false.
|
67
|
+
def xcode_project_file?
|
68
|
+
['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
|
69
|
+
end
|
70
|
+
|
71
|
+
# Internal: Is the blob minified JS?
|
72
|
+
#
|
73
|
+
# Consider JS minified if the average line length is
|
74
|
+
# greater than 100 characters.
|
75
|
+
#
|
76
|
+
# Returns true or false.
|
77
|
+
def minified_javascript?
|
78
|
+
return unless extname == '.js'
|
79
|
+
if lines.any?
|
80
|
+
(lines.inject(0) { |n, l| n += l.length } / lines.length) > 100
|
81
|
+
else
|
82
|
+
false
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
# Internal: Is the blob of JS generated by CoffeeScript?
|
87
|
+
#
|
88
|
+
# CoffeeScript is meant to output JS that would be difficult to
|
89
|
+
# tell if it was generated or not. Look for a number of patterns
|
90
|
+
# output by the CS compiler.
|
91
|
+
#
|
92
|
+
# Return true or false
|
93
|
+
def compiled_coffeescript?
|
94
|
+
return false unless extname == '.js'
|
95
|
+
|
96
|
+
# CoffeeScript generated by > 1.2 include a comment on the first line
|
97
|
+
if lines[0] =~ /^\/\/ Generated by /
|
98
|
+
return true
|
99
|
+
end
|
100
|
+
|
101
|
+
if lines[0] == '(function() {' && # First line is module closure opening
|
102
|
+
lines[-2] == '}).call(this);' && # Second to last line closes module closure
|
103
|
+
lines[-1] == '' # Last line is blank
|
104
|
+
|
105
|
+
score = 0
|
106
|
+
|
107
|
+
lines.each do |line|
|
108
|
+
if line =~ /var /
|
109
|
+
# Underscored temp vars are likely to be Coffee
|
110
|
+
score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
|
111
|
+
|
112
|
+
# bind and extend functions are very Coffee specific
|
113
|
+
score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
# Require a score of 3. This is fairly arbitrary. Consider
|
118
|
+
# tweaking later.
|
119
|
+
score >= 3
|
120
|
+
else
|
121
|
+
false
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
# Internal: Is this a generated documentation file for a .NET assembly?
|
126
|
+
#
|
127
|
+
# .NET developers often check in the XML Intellisense file along with an
|
128
|
+
# assembly - however, these don't have a special extension, so we have to
|
129
|
+
# dig into the contents to determine if it's a docfile. Luckily, these files
|
130
|
+
# are extremely structured, so recognizing them is easy.
|
131
|
+
#
|
132
|
+
# Returns true or false
|
133
|
+
def generated_net_docfile?
|
134
|
+
return false unless extname.downcase == ".xml"
|
135
|
+
return false unless lines.count > 3
|
136
|
+
|
137
|
+
# .NET Docfiles always open with <doc> and their first tag is an
|
138
|
+
# <assembly> tag
|
139
|
+
return lines[1].include?("<doc>") &&
|
140
|
+
lines[2].include?("<assembly>") &&
|
141
|
+
lines[-2].include?("</doc>")
|
142
|
+
end
|
143
|
+
|
144
|
+
# Internal: Is the blob of JS a parser generated by PEG.js?
|
145
|
+
#
|
146
|
+
# PEG.js-generated parsers are not meant to be consumed by humans.
|
147
|
+
#
|
148
|
+
# Return true or false
|
149
|
+
def generated_parser?
|
150
|
+
return false unless extname == '.js'
|
151
|
+
|
152
|
+
# PEG.js-generated parsers include a comment near the top of the file
|
153
|
+
# that marks them as such.
|
154
|
+
if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
|
155
|
+
return true
|
156
|
+
end
|
157
|
+
|
158
|
+
false
|
159
|
+
end
|
160
|
+
end
|
161
|
+
end
|