github-linguist 2.1.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  require 'linguist/blob_helper'
2
+ require 'linguist/generated'
2
3
  require 'linguist/language'
3
4
  require 'linguist/mime'
4
5
  require 'linguist/repository'
@@ -1,7 +1,6 @@
1
- require 'linguist/classifier'
1
+ require 'linguist/generated'
2
2
  require 'linguist/language'
3
3
  require 'linguist/mime'
4
- require 'linguist/samples'
5
4
 
6
5
  require 'charlock_holmes'
7
6
  require 'escape_utils'
@@ -129,15 +128,6 @@ module Linguist
129
128
  ['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
130
129
  end
131
130
 
132
- # Public: Is the blob likely to have a shebang?
133
- #
134
- # Return true or false
135
- def shebang_extname?
136
- extname.empty? &&
137
- mode &&
138
- (mode.to_i(8) & 05) == 05
139
- end
140
-
141
131
  MEGABYTE = 1024 * 1024
142
132
 
143
133
  # Public: Is the blob too big to load?
@@ -221,143 +211,16 @@ module Linguist
221
211
  lines.grep(/\S/).size
222
212
  end
223
213
 
224
- # Internal: Compute average line length.
225
- #
226
- # Returns Integer.
227
- def average_line_length
228
- if lines.any?
229
- lines.inject(0) { |n, l| n += l.length } / lines.length
230
- else
231
- 0
232
- end
233
- end
234
-
235
214
  # Public: Is the blob a generated file?
236
215
  #
237
216
  # Generated source code is supressed in diffs and is ignored by
238
217
  # language statistics.
239
218
  #
240
- # Requires Blob#data
241
- #
242
- # Includes:
243
- # - XCode project XML files
244
- # - Minified JavaScript
245
- # - Compiled CoffeeScript
246
- # - PEG.js-generated parsers
247
- #
248
- # Please add additional test coverage to
249
- # `test/test_blob.rb#test_generated` if you make any changes.
219
+ # May load Blob#data
250
220
  #
251
221
  # Return true or false
252
222
  def generated?
253
- if name == 'Gemfile.lock' || minified_javascript? || compiled_coffeescript? ||
254
- xcode_project_file? || generated_net_docfile? || generated_parser?
255
- true
256
- else
257
- false
258
- end
259
- end
260
-
261
- # Internal: Is the blob an XCode project file?
262
- #
263
- # Generated if the file extension is an XCode project
264
- # file extension.
265
- #
266
- # Returns true of false.
267
- def xcode_project_file?
268
- ['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
269
- end
270
-
271
- # Internal: Is the blob minified JS?
272
- #
273
- # Consider JS minified if the average line length is
274
- # greater then 100c.
275
- #
276
- # Returns true or false.
277
- def minified_javascript?
278
- return unless extname == '.js'
279
- average_line_length > 100
280
- end
281
-
282
- # Internal: Is the blob of JS a parser generated by PEG.js?
283
- #
284
- # Requires Blob#data
285
- #
286
- # PEG.js-generated parsers are not meant to be consumed by humans.
287
- #
288
- # Return true or false
289
- def generated_parser?
290
- return false unless extname == '.js'
291
-
292
- # PEG.js-generated parsers include a comment near the top of the file
293
- # that marks them as such.
294
- if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
295
- return true
296
- end
297
-
298
- false
299
- end
300
-
301
- # Internal: Is the blob of JS generated by CoffeeScript?
302
- #
303
- # Requires Blob#data
304
- #
305
- # CoffeScript is meant to output JS that would be difficult to
306
- # tell if it was generated or not. Look for a number of patterns
307
- # output by the CS compiler.
308
- #
309
- # Return true or false
310
- def compiled_coffeescript?
311
- return false unless extname == '.js'
312
-
313
- # CoffeeScript generated by > 1.2 include a comment on the first line
314
- if lines[0] =~ /^\/\/ Generated by /
315
- return true
316
- end
317
-
318
- if lines[0] == '(function() {' && # First line is module closure opening
319
- lines[-2] == '}).call(this);' && # Second to last line closes module closure
320
- lines[-1] == '' # Last line is blank
321
-
322
- score = 0
323
-
324
- lines.each do |line|
325
- if line =~ /var /
326
- # Underscored temp vars are likely to be Coffee
327
- score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
328
-
329
- # bind and extend functions are very Coffee specific
330
- score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
331
- end
332
- end
333
-
334
- # Require a score of 3. This is fairly arbitrary. Consider
335
- # tweaking later.
336
- score >= 3
337
- else
338
- false
339
- end
340
- end
341
-
342
- # Internal: Is this a generated documentation file for a .NET assembly?
343
- #
344
- # Requires Blob#data
345
- #
346
- # .NET developers often check in the XML Intellisense file along with an
347
- # assembly - however, these don't have a special extension, so we have to
348
- # dig into the contents to determine if it's a docfile. Luckily, these files
349
- # are extremely structured, so recognizing them is easy.
350
- #
351
- # Returns true or false
352
- def generated_net_docfile?
353
- return false unless extname.downcase == ".xml"
354
- return false unless lines.count > 3
355
-
356
- # .NET Docfiles always open with <doc> and their first tag is an
357
- # <assembly> tag
358
- return lines[1].include?("<doc>") &&
359
- lines[2].include?("<assembly>") &&
360
- lines[-2].include?("</doc>")
223
+ @_generated ||= Generated.generated?(name, lambda { data })
361
224
  end
362
225
 
363
226
  # Public: Should the blob be indexed for searching?
@@ -375,6 +238,8 @@ module Linguist
375
238
  def indexable?
376
239
  if binary?
377
240
  false
241
+ elsif extname == '.txt'
242
+ true
378
243
  elsif language.nil?
379
244
  false
380
245
  elsif !language.searchable?
@@ -396,30 +261,11 @@ module Linguist
396
261
  def language
397
262
  if defined? @language
398
263
  @language
399
- else
400
- @language = guess_language
264
+ elsif !binary_mime_type?
265
+ @language = Language.detect(name.to_s, lambda { data }, mode)
401
266
  end
402
267
  end
403
268
 
404
- # Internal: Guess language
405
- #
406
- # Please add additional test coverage to
407
- # `test/test_blob.rb#test_language` if you make any changes.
408
- #
409
- # Returns a Language or nil
410
- def guess_language
411
- return if binary_mime_type?
412
-
413
- # Disambiguate between multiple language extensions
414
- disambiguate_extension_language ||
415
-
416
- # See if there is a Language for the extension
417
- Language.find_by_filename(name.to_s) ||
418
-
419
- # Try to detect Language from shebang line
420
- shebang_language
421
- end
422
-
423
269
  # Internal: Get the lexer of the blob.
424
270
  #
425
271
  # Returns a Lexer.
@@ -427,86 +273,6 @@ module Linguist
427
273
  language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
428
274
  end
429
275
 
430
- # Internal: Disambiguates between multiple language extensions.
431
- #
432
- # Returns a Language or nil.
433
- def disambiguate_extension_language
434
- if Language.ambiguous?(extname)
435
- possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
436
- if possible_languages.any?
437
- if result = Classifier.classify(Samples::DATA, data, possible_languages).first
438
- Language[result[0]]
439
- end
440
- end
441
- end
442
- end
443
-
444
- # Internal: Extract the script name from the shebang line
445
- #
446
- # Requires Blob#data
447
- #
448
- # Examples
449
- #
450
- # '#!/usr/bin/ruby'
451
- # # => 'ruby'
452
- #
453
- # '#!/usr/bin/env ruby'
454
- # # => 'ruby'
455
- #
456
- # '#!/usr/bash/python2.4'
457
- # # => 'python'
458
- #
459
- # Please add additional test coverage to
460
- # `test/test_blob.rb#test_shebang_script` if you make any changes.
461
- #
462
- # Returns a script name String or nil
463
- def shebang_script
464
- # Fail fast if blob isn't viewable?
465
- return unless viewable?
466
-
467
- if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
468
- bang.sub!(/^#! /, '#!')
469
- tokens = bang.split(' ')
470
- pieces = tokens.first.split('/')
471
- if pieces.size > 1
472
- script = pieces.last
473
- else
474
- script = pieces.first.sub('#!', '')
475
- end
476
-
477
- script = script == 'env' ? tokens[1] : script
478
-
479
- # python2.4 => python
480
- if script =~ /((?:\d+\.?)+)/
481
- script.sub! $1, ''
482
- end
483
-
484
- # Check for multiline shebang hacks that exec themselves
485
- #
486
- # #!/bin/sh
487
- # exec foo "$0" "$@"
488
- #
489
- if script == 'sh' &&
490
- lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
491
- script = $1
492
- end
493
-
494
- script
495
- end
496
- end
497
-
498
- # Internal: Get Language for shebang script
499
- #
500
- # Returns the Language or nil
501
- def shebang_language
502
- # Skip file extensions unlikely to have shebangs
503
- return unless shebang_extname?
504
-
505
- if script = shebang_script
506
- Language[script]
507
- end
508
- end
509
-
510
276
  # Public: Highlight syntax of blob
511
277
  #
512
278
  # options - A Hash of options (defaults to {})
@@ -0,0 +1,161 @@
1
module Linguist
  # Decides whether a file looks machine-generated, based on its name and
  # (only when needed) its contents. Each check guards on the file
  # extension before it touches the contents, so lazily supplied data is
  # loaded only for extensions that require content inspection.
  class Generated
    # File extensions treated as XCode project artifacts.
    XCODE_EXTENSIONS = ['.xib', '.nib', '.storyboard', '.pbxproj',
                        '.xcworkspacedata', '.xcuserstate'].freeze

    # Temp variable names emitted by the CoffeeScript compiler (1 point each).
    COFFEE_TEMP_VARS = /_fn|_i|_len|_ref|_results/

    # Helper names emitted by the CoffeeScript compiler (3 points each).
    COFFEE_HELPERS = /__bind|__extends|__hasProp|__indexOf|__slice/

    # Comment marker written near the top of PEG.js output. The pattern
    # requires the marker to sit inside the first /* ... */ comment.
    PEGJS_MARKER = /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG\.js/

    # Public: Is the blob a generated file?
    #
    # name - String filename (nil is tolerated and treated as having no
    #        extension)
    # data - String blob data. An object responding to #call may also be
    #        passed in for lazy loading. This behavior is deprecated and
    #        you should always pass in a String.
    #
    # Return true or false
    def self.generated?(name, data)
      new(name, data).generated?
    end

    # Internal: Initialize Generated instance
    #
    # name - String filename
    # data - String blob data, or a callable returning it
    def initialize(name, data)
      @name = name
      # to_s keeps a nil name from raising TypeError in File.extname.
      @extname = File.extname(name.to_s)
      @_data = data
    end

    attr_reader :name, :extname

    # Lazy load blob data if a callable was passed in.
    #
    # The callable is invoked at most once, even when it returns nil.
    #
    # Returns String data (or nil if none was supplied).
    def data
      return @data if defined?(@data)
      @data = @_data.respond_to?(:call) ? @_data.call : @_data
    end

    # Public: Get each line of data
    #
    # Missing (nil) data is treated as empty. The -1 limit keeps trailing
    # empty strings, so a file ending in "\n" has '' as its last line.
    #
    # Returns an Array of lines
    def lines
      @lines ||= data.to_s.split("\n", -1)
    end

    # Internal: Is the blob a generated file?
    #
    # Generated source code is suppressed in diffs and is ignored by
    # language statistics.
    #
    # Please add additional test coverage to
    # `test/test_blob.rb#test_generated` if you make any changes.
    #
    # Return true or false
    def generated?
      name == 'Gemfile.lock' ||
        minified_javascript? ||
        compiled_coffeescript? ||
        xcode_project_file? ||
        generated_net_docfile? ||
        generated_parser?
    end

    # Internal: Is the blob an XCode project file?
    #
    # Generated if the file extension is an XCode project
    # file extension.
    #
    # Returns true or false.
    def xcode_project_file?
      XCODE_EXTENSIONS.include?(extname)
    end

    # Internal: Is the blob minified JS?
    #
    # Consider JS minified if the average line length (integer division)
    # is greater than 100 characters.
    #
    # Returns true or false.
    def minified_javascript?
      return false unless extname == '.js'
      return false if lines.empty?

      total_length = lines.inject(0) { |sum, line| sum + line.length }
      (total_length / lines.length) > 100
    end

    # Internal: Is the blob of JS generated by CoffeeScript?
    #
    # CoffeeScript is meant to output JS that would be difficult to
    # tell if it was generated or not. Look for a number of patterns
    # output by the CS compiler.
    #
    # Return true or false
    def compiled_coffeescript?
      return false unless extname == '.js'
      return false if lines.empty?

      # CoffeeScript generated by > 1.2 include a comment on the first line
      return true if lines[0] =~ /^\/\/ Generated by /

      wrapped_in_closure =
        lines[0] == '(function() {' &&    # First line is module closure opening
        lines[-2] == '}).call(this);' &&  # Second to last line closes module closure
        lines[-1] == ''                   # Last line is blank
      return false unless wrapped_in_closure

      score = 0
      lines.each do |line|
        next unless line =~ /var /

        # Underscored temp vars are likely to be Coffee
        score += line.scan(COFFEE_TEMP_VARS).size

        # bind and extend functions are very Coffee specific
        score += 3 * line.scan(COFFEE_HELPERS).size
      end

      # Require a score of 3. This is fairly arbitrary. Consider
      # tweaking later.
      score >= 3
    end

    # Internal: Is this a generated documentation file for a .NET assembly?
    #
    # .NET developers often check in the XML Intellisense file along with an
    # assembly - however, these don't have a special extension, so we have to
    # dig into the contents to determine if it's a docfile. Luckily, these files
    # are extremely structured, so recognizing them is easy.
    #
    # Returns true or false
    def generated_net_docfile?
      return false unless extname.downcase == ".xml"
      return false unless lines.count > 3

      # .NET Docfiles always open with <doc> and their first tag is an
      # <assembly> tag
      lines[1].include?("<doc>") &&
        lines[2].include?("<assembly>") &&
        lines[-2].include?("</doc>")
    end

    # Internal: Is the blob of JS a parser generated by PEG.js?
    #
    # PEG.js-generated parsers are not meant to be consumed by humans.
    #
    # Return true or false
    def generated_parser?
      return false unless extname == '.js'

      # PEG.js-generated parsers include a comment near the top of the file
      # that marks them as such; only the first five lines are inspected.
      lines[0..4].join('') =~ PEGJS_MARKER ? true : false
    end
  end
end