github-linguist 2.1.2 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,5 @@
1
1
  require 'linguist/blob_helper'
2
+ require 'linguist/generated'
2
3
  require 'linguist/language'
3
4
  require 'linguist/mime'
4
5
  require 'linguist/repository'
@@ -1,7 +1,6 @@
1
- require 'linguist/classifier'
1
+ require 'linguist/generated'
2
2
  require 'linguist/language'
3
3
  require 'linguist/mime'
4
- require 'linguist/samples'
5
4
 
6
5
  require 'charlock_holmes'
7
6
  require 'escape_utils'
@@ -129,15 +128,6 @@ module Linguist
129
128
  ['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
130
129
  end
131
130
 
132
- # Public: Is the blob likely to have a shebang?
133
- #
134
- # Return true or false
135
- def shebang_extname?
136
- extname.empty? &&
137
- mode &&
138
- (mode.to_i(8) & 05) == 05
139
- end
140
-
141
131
  MEGABYTE = 1024 * 1024
142
132
 
143
133
  # Public: Is the blob too big to load?
@@ -221,143 +211,16 @@ module Linguist
221
211
  lines.grep(/\S/).size
222
212
  end
223
213
 
224
- # Internal: Compute average line length.
225
- #
226
- # Returns Integer.
227
- def average_line_length
228
- if lines.any?
229
- lines.inject(0) { |n, l| n += l.length } / lines.length
230
- else
231
- 0
232
- end
233
- end
234
-
235
214
  # Public: Is the blob a generated file?
236
215
  #
237
216
  # Generated source code is suppressed in diffs and is ignored by
238
217
  # language statistics.
239
218
  #
240
- # Requires Blob#data
241
- #
242
- # Includes:
243
- # - XCode project XML files
244
- # - Minified JavaScript
245
- # - Compiled CoffeeScript
246
- # - PEG.js-generated parsers
247
- #
248
- # Please add additional test coverage to
249
- # `test/test_blob.rb#test_generated` if you make any changes.
219
+ # May load Blob#data
250
220
  #
251
221
  # Return true or false
252
222
  def generated?
253
- if name == 'Gemfile.lock' || minified_javascript? || compiled_coffeescript? ||
254
- xcode_project_file? || generated_net_docfile? || generated_parser?
255
- true
256
- else
257
- false
258
- end
259
- end
260
-
261
- # Internal: Is the blob an XCode project file?
262
- #
263
- # Generated if the file extension is an XCode project
264
- # file extension.
265
- #
266
- # Returns true of false.
267
- def xcode_project_file?
268
- ['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
269
- end
270
-
271
- # Internal: Is the blob minified JS?
272
- #
273
- # Consider JS minified if the average line length is
274
- # greater then 100c.
275
- #
276
- # Returns true or false.
277
- def minified_javascript?
278
- return unless extname == '.js'
279
- average_line_length > 100
280
- end
281
-
282
- # Internal: Is the blob of JS a parser generated by PEG.js?
283
- #
284
- # Requires Blob#data
285
- #
286
- # PEG.js-generated parsers are not meant to be consumed by humans.
287
- #
288
- # Return true or false
289
- def generated_parser?
290
- return false unless extname == '.js'
291
-
292
- # PEG.js-generated parsers include a comment near the top of the file
293
- # that marks them as such.
294
- if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
295
- return true
296
- end
297
-
298
- false
299
- end
300
-
301
- # Internal: Is the blob of JS generated by CoffeeScript?
302
- #
303
- # Requires Blob#data
304
- #
305
- # CoffeScript is meant to output JS that would be difficult to
306
- # tell if it was generated or not. Look for a number of patterns
307
- # output by the CS compiler.
308
- #
309
- # Return true or false
310
- def compiled_coffeescript?
311
- return false unless extname == '.js'
312
-
313
- # CoffeeScript generated by > 1.2 include a comment on the first line
314
- if lines[0] =~ /^\/\/ Generated by /
315
- return true
316
- end
317
-
318
- if lines[0] == '(function() {' && # First line is module closure opening
319
- lines[-2] == '}).call(this);' && # Second to last line closes module closure
320
- lines[-1] == '' # Last line is blank
321
-
322
- score = 0
323
-
324
- lines.each do |line|
325
- if line =~ /var /
326
- # Underscored temp vars are likely to be Coffee
327
- score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
328
-
329
- # bind and extend functions are very Coffee specific
330
- score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
331
- end
332
- end
333
-
334
- # Require a score of 3. This is fairly arbitrary. Consider
335
- # tweaking later.
336
- score >= 3
337
- else
338
- false
339
- end
340
- end
341
-
342
- # Internal: Is this a generated documentation file for a .NET assembly?
343
- #
344
- # Requires Blob#data
345
- #
346
- # .NET developers often check in the XML Intellisense file along with an
347
- # assembly - however, these don't have a special extension, so we have to
348
- # dig into the contents to determine if it's a docfile. Luckily, these files
349
- # are extremely structured, so recognizing them is easy.
350
- #
351
- # Returns true or false
352
- def generated_net_docfile?
353
- return false unless extname.downcase == ".xml"
354
- return false unless lines.count > 3
355
-
356
- # .NET Docfiles always open with <doc> and their first tag is an
357
- # <assembly> tag
358
- return lines[1].include?("<doc>") &&
359
- lines[2].include?("<assembly>") &&
360
- lines[-2].include?("</doc>")
223
+ @_generated ||= Generated.generated?(name, lambda { data })
361
224
  end
362
225
 
363
226
  # Public: Should the blob be indexed for searching?
@@ -375,6 +238,8 @@ module Linguist
375
238
  def indexable?
376
239
  if binary?
377
240
  false
241
+ elsif extname == '.txt'
242
+ true
378
243
  elsif language.nil?
379
244
  false
380
245
  elsif !language.searchable?
@@ -396,30 +261,11 @@ module Linguist
396
261
  def language
397
262
  if defined? @language
398
263
  @language
399
- else
400
- @language = guess_language
264
+ elsif !binary_mime_type?
265
+ @language = Language.detect(name.to_s, lambda { data }, mode)
401
266
  end
402
267
  end
403
268
 
404
- # Internal: Guess language
405
- #
406
- # Please add additional test coverage to
407
- # `test/test_blob.rb#test_language` if you make any changes.
408
- #
409
- # Returns a Language or nil
410
- def guess_language
411
- return if binary_mime_type?
412
-
413
- # Disambiguate between multiple language extensions
414
- disambiguate_extension_language ||
415
-
416
- # See if there is a Language for the extension
417
- Language.find_by_filename(name.to_s) ||
418
-
419
- # Try to detect Language from shebang line
420
- shebang_language
421
- end
422
-
423
269
  # Internal: Get the lexer of the blob.
424
270
  #
425
271
  # Returns a Lexer.
@@ -427,86 +273,6 @@ module Linguist
427
273
  language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
428
274
  end
429
275
 
430
- # Internal: Disambiguates between multiple language extensions.
431
- #
432
- # Returns a Language or nil.
433
- def disambiguate_extension_language
434
- if Language.ambiguous?(extname)
435
- possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
436
- if possible_languages.any?
437
- if result = Classifier.classify(Samples::DATA, data, possible_languages).first
438
- Language[result[0]]
439
- end
440
- end
441
- end
442
- end
443
-
444
- # Internal: Extract the script name from the shebang line
445
- #
446
- # Requires Blob#data
447
- #
448
- # Examples
449
- #
450
- # '#!/usr/bin/ruby'
451
- # # => 'ruby'
452
- #
453
- # '#!/usr/bin/env ruby'
454
- # # => 'ruby'
455
- #
456
- # '#!/usr/bash/python2.4'
457
- # # => 'python'
458
- #
459
- # Please add additional test coverage to
460
- # `test/test_blob.rb#test_shebang_script` if you make any changes.
461
- #
462
- # Returns a script name String or nil
463
- def shebang_script
464
- # Fail fast if blob isn't viewable?
465
- return unless viewable?
466
-
467
- if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
468
- bang.sub!(/^#! /, '#!')
469
- tokens = bang.split(' ')
470
- pieces = tokens.first.split('/')
471
- if pieces.size > 1
472
- script = pieces.last
473
- else
474
- script = pieces.first.sub('#!', '')
475
- end
476
-
477
- script = script == 'env' ? tokens[1] : script
478
-
479
- # python2.4 => python
480
- if script =~ /((?:\d+\.?)+)/
481
- script.sub! $1, ''
482
- end
483
-
484
- # Check for multiline shebang hacks that exec themselves
485
- #
486
- # #!/bin/sh
487
- # exec foo "$0" "$@"
488
- #
489
- if script == 'sh' &&
490
- lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
491
- script = $1
492
- end
493
-
494
- script
495
- end
496
- end
497
-
498
- # Internal: Get Language for shebang script
499
- #
500
- # Returns the Language or nil
501
- def shebang_language
502
- # Skip file extensions unlikely to have shebangs
503
- return unless shebang_extname?
504
-
505
- if script = shebang_script
506
- Language[script]
507
- end
508
- end
509
-
510
276
  # Public: Highlight syntax of blob
511
277
  #
512
278
  # options - A Hash of options (defaults to {})
@@ -0,0 +1,161 @@
1
+ module Linguist
2
+ class Generated
3
+ # Public: Is the blob a generated file?
4
+ #
5
+ # name - String filename
6
+ # data - String blob data. A block may also be passed in for lazy
7
+ # loading. This behavior is deprecated and you should always
8
+ # pass in a String.
9
+ #
10
+ # Return true or false
11
+ def self.generated?(name, data)
12
+ new(name, data).generated?
13
+ end
14
+
15
+ # Internal: Initialize Generated instance
16
+ #
17
+ # name - String filename
18
+ # data - String blob data
19
+ def initialize(name, data)
20
+ @name = name
21
+ @extname = File.extname(name)
22
+ @_data = data
23
+ end
24
+
25
+ attr_reader :name, :extname
26
+
27
+ # Lazy load blob data if block was passed in.
28
+ #
29
+ # Awful, awful stuff happening here.
30
+ #
31
+ # Returns String data.
32
+ def data
33
+ @data ||= @_data.respond_to?(:call) ? @_data.call() : @_data
34
+ end
35
+
36
+ # Public: Get each line of data
37
+ #
38
+ # Returns an Array of lines
39
+ def lines
40
+ @lines ||= data.split("\n", -1)
41
+ end
42
+
43
+ # Internal: Is the blob a generated file?
44
+ #
45
+ # Generated source code is suppressed in diffs and is ignored by
46
+ # language statistics.
47
+ #
48
+ # Please add additional test coverage to
49
+ # `test/test_blob.rb#test_generated` if you make any changes.
50
+ #
51
+ # Return true or false
52
+ def generated?
53
+ name == 'Gemfile.lock' ||
54
+ minified_javascript? ||
55
+ compiled_coffeescript? ||
56
+ xcode_project_file? ||
57
+ generated_net_docfile? ||
58
+ generated_parser?
59
+ end
60
+
61
+ # Internal: Is the blob an XCode project file?
62
+ #
63
+ # Generated if the file extension is an XCode project
64
+ # file extension.
65
+ #
66
+ # Returns true or false.
67
+ def xcode_project_file?
68
+ ['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
69
+ end
70
+
71
+ # Internal: Is the blob minified JS?
72
+ #
73
+ # Consider JS minified if the average line length is
74
+ # greater than 100 characters.
75
+ #
76
+ # Returns true or false.
77
+ def minified_javascript?
78
+ return unless extname == '.js'
79
+ if lines.any?
80
+ (lines.inject(0) { |n, l| n += l.length } / lines.length) > 100
81
+ else
82
+ false
83
+ end
84
+ end
85
+
86
+ # Internal: Is the blob of JS generated by CoffeeScript?
87
+ #
88
+ # CoffeeScript is meant to output JS that would be difficult to
89
+ # tell if it was generated or not. Look for a number of patterns
90
+ # output by the CS compiler.
91
+ #
92
+ # Return true or false
93
+ def compiled_coffeescript?
94
+ return false unless extname == '.js'
95
+
96
+ # CoffeeScript generated by > 1.2 include a comment on the first line
97
+ if lines[0] =~ /^\/\/ Generated by /
98
+ return true
99
+ end
100
+
101
+ if lines[0] == '(function() {' && # First line is module closure opening
102
+ lines[-2] == '}).call(this);' && # Second to last line closes module closure
103
+ lines[-1] == '' # Last line is blank
104
+
105
+ score = 0
106
+
107
+ lines.each do |line|
108
+ if line =~ /var /
109
+ # Underscored temp vars are likely to be Coffee
110
+ score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
111
+
112
+ # bind and extend functions are very Coffee specific
113
+ score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
114
+ end
115
+ end
116
+
117
+ # Require a score of 3. This is fairly arbitrary. Consider
118
+ # tweaking later.
119
+ score >= 3
120
+ else
121
+ false
122
+ end
123
+ end
124
+
125
+ # Internal: Is this a generated documentation file for a .NET assembly?
126
+ #
127
+ # .NET developers often check in the XML Intellisense file along with an
128
+ # assembly - however, these don't have a special extension, so we have to
129
+ # dig into the contents to determine if it's a docfile. Luckily, these files
130
+ # are extremely structured, so recognizing them is easy.
131
+ #
132
+ # Returns true or false
133
+ def generated_net_docfile?
134
+ return false unless extname.downcase == ".xml"
135
+ return false unless lines.count > 3
136
+
137
+ # .NET Docfiles always open with <doc> and their first tag is an
138
+ # <assembly> tag
139
+ return lines[1].include?("<doc>") &&
140
+ lines[2].include?("<assembly>") &&
141
+ lines[-2].include?("</doc>")
142
+ end
143
+
144
+ # Internal: Is the blob of JS a parser generated by PEG.js?
145
+ #
146
+ # PEG.js-generated parsers are not meant to be consumed by humans.
147
+ #
148
+ # Return true or false
149
+ def generated_parser?
150
+ return false unless extname == '.js'
151
+
152
+ # PEG.js-generated parsers include a comment near the top of the file
153
+ # that marks them as such.
154
+ if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
155
+ return true
156
+ end
157
+
158
+ false
159
+ end
160
+ end
161
+ end