github-linguist 2.0.1 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/bin/linguist CHANGED
@@ -28,7 +28,7 @@ elsif File.file?(path)
28
28
  puts " language: #{blob.language}"
29
29
 
30
30
  if blob.large?
31
- puts " blob is to large to be shown"
31
+ puts " blob is too large to be shown"
32
32
  end
33
33
 
34
34
  if blob.generated?
data/lib/linguist.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'linguist/blob_helper'
2
2
  require 'linguist/language'
3
3
  require 'linguist/mime'
4
- require 'linguist/pathname'
5
4
  require 'linguist/repository'
5
+ require 'linguist/samples'
@@ -1,7 +1,7 @@
1
1
  require 'linguist/classifier'
2
2
  require 'linguist/language'
3
3
  require 'linguist/mime'
4
- require 'linguist/pathname'
4
+ require 'linguist/samples'
5
5
 
6
6
  require 'charlock_holmes'
7
7
  require 'escape_utils'
@@ -12,13 +12,6 @@ module Linguist
12
12
  # BlobHelper is a mixin for Blobish classes that respond to "name",
13
13
  # "data" and "size" such as Grit::Blob.
14
14
  module BlobHelper
15
- # Internal: Get a Pathname wrapper for Blob#name
16
- #
17
- # Returns a Pathname.
18
- def pathname
19
- Pathname.new(name || "")
20
- end
21
-
22
15
  # Public: Get the extname of the path
23
16
  #
24
17
  # Examples
@@ -28,7 +21,7 @@ module Linguist
28
21
  #
29
22
  # Returns a String
30
23
  def extname
31
- pathname.extname
24
+ File.extname(name)
32
25
  end
33
26
 
34
27
  # Public: Get the actual blob mime type
@@ -40,7 +33,7 @@ module Linguist
40
33
  #
41
34
  # Returns a mime type String.
42
35
  def mime_type
43
- @mime_type ||= pathname.mime_type
36
+ @mime_type ||= Mime.mime_for(extname)
44
37
  end
45
38
 
46
39
  # Public: Get the Content-Type header value
@@ -72,7 +65,7 @@ module Linguist
72
65
  elsif name.nil?
73
66
  "attachment"
74
67
  else
75
- "attachment; filename=#{EscapeUtils.escape_url(pathname.basename)}"
68
+ "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
76
69
  end
77
70
  end
78
71
 
@@ -95,7 +88,7 @@ module Linguist
95
88
  #
96
89
  # Return true or false
97
90
  def binary_mime_type?
98
- if mime_type = Mime.lookup_mime_type_for(pathname.extname)
91
+ if mime_type = Mime.lookup_mime_type_for(extname)
99
92
  mime_type.binary?
100
93
  end
101
94
  end
@@ -136,13 +129,6 @@ module Linguist
136
129
  ['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
137
130
  end
138
131
 
139
- # Public: Is the blob a possible drupal php file?
140
- #
141
- # Return true or false
142
- def drupal_extname?
143
- ['.module', '.install', '.test', '.inc'].include?(extname)
144
- end
145
-
146
132
  # Public: Is the blob likely to have a shebang?
147
133
  #
148
134
  # Return true or false
@@ -428,10 +414,7 @@ module Linguist
428
414
  disambiguate_extension_language ||
429
415
 
430
416
  # See if there is a Language for the extension
431
- pathname.language ||
432
-
433
- # Look for idioms in first line
434
- first_line_language ||
417
+ Language.find_by_filename(name) ||
435
418
 
436
419
  # Try to detect Language from shebang line
437
420
  shebang_language
@@ -446,179 +429,18 @@ module Linguist
446
429
 
447
430
  # Internal: Disambiguates between multiple language extensions.
448
431
  #
449
- # Delegates to "guess_EXTENSION_language".
450
- #
451
- # Please add additional test coverage to
452
- # `test/test_blob.rb#test_language` if you add another method.
453
- #
454
432
  # Returns a Language or nil.
455
433
  def disambiguate_extension_language
456
434
  if Language.ambiguous?(extname)
457
- # name = "guess_#{extname.sub(/^\./, '')}_language"
458
- # send(name) if respond_to?(name)
459
-
460
- possible_languages = Language.all.select { |l| l.extensions.include?(extname) }
435
+ possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
461
436
  if possible_languages.any?
462
- if result = Classifier.instance.classify(data, possible_languages).first
463
- result[0]
437
+ if result = Classifier.classify(Samples::DATA, data, possible_languages).first
438
+ Language[result[0]]
464
439
  end
465
440
  end
466
441
  end
467
442
  end
468
443
 
469
- # Internal: Guess language of .cls files
470
- #
471
- # Returns a Language.
472
- def guess_cls_language
473
- if lines.grep(/^(%|\\)/).any?
474
- Language['TeX']
475
- elsif lines.grep(/^\s*(CLASS|METHOD|INTERFACE).*:\s*/i).any? || lines.grep(/^\s*(USING|DEFINE)/i).any?
476
- Language['OpenEdge ABL']
477
- elsif lines.grep(/\{$/).any? || lines.grep(/\}$/).any?
478
- Language['Apex']
479
- elsif lines.grep(/^(\'\*|Attribute|Option|Sub|Private|Protected|Public|Friend)/i).any?
480
- Language['Visual Basic']
481
- else
482
- # The most common language should be the fallback
483
- Language['TeX']
484
- end
485
- end
486
-
487
- # Internal: Guess language of header files (.h).
488
- #
489
- # Returns a Language.
490
- def guess_h_language
491
- if lines.grep(/^@(interface|property|private|public|end)/).any?
492
- Language['Objective-C']
493
- elsif lines.grep(/^class |^\s+(public|protected|private):/).any?
494
- Language['C++']
495
- else
496
- Language['C']
497
- end
498
- end
499
-
500
- # Internal: Guess language of .m files.
501
- #
502
- # Objective-C heuristics:
503
- # * Keywords ("#import", "#include", "#ifdef", #define, "@end") or "//" and opening "\*" comments
504
- #
505
- # Matlab heuristics:
506
- # * Leading "function " of "classdef " keyword
507
- # * "%" comments
508
- #
509
- # Note: All "#" keywords, e.g., "#import", are guaranteed to be Objective-C. Because the ampersand
510
- # is used to created function handles and anonymous functions in Matlab, most "@" keywords are not
511
- # safe heuristics. However, "end" is a reserved term in Matlab and can't be used to create a valid
512
- # function handle. Because @end is required to close any @implementation, @property, @interface,
513
- # @synthesize, etc. directive in Objective-C, only @end needs to be checked for.
514
- #
515
- # Returns a Language.
516
- def guess_m_language
517
- # Objective-C keywords or comments
518
- if lines.grep(/^#(import|include|ifdef|define)|@end/).any? || lines.grep(/^\s*\/\//).any? || lines.grep(/^\s*\/\*/).any?
519
- Language['Objective-C']
520
-
521
- # Matlab file function or class or comments
522
- elsif lines.any? && lines.first.match(/^\s*(function |classdef )/) || lines.grep(/^\s*%/).any?
523
- Language['Matlab']
524
-
525
- # Fallback to Objective-C, don't want any Matlab false positives
526
- else
527
- Language['Objective-C']
528
- end
529
- end
530
-
531
- # Internal: Guess language of .pl files
532
- #
533
- # The rules for disambiguation are:
534
- #
535
- # 1. Many perl files begin with a shebang
536
- # 2. Most Prolog source files have a rule somewhere (marked by the :- operator)
537
- # 3. Default to Perl, because it is more popular
538
- #
539
- # Returns a Language.
540
- def guess_pl_language
541
- if shebang_script == 'perl'
542
- Language['Perl']
543
- elsif lines.grep(/:-/).any?
544
- Language['Prolog']
545
- else
546
- Language['Perl']
547
- end
548
- end
549
-
550
- # Internal: Guess language of .r files.
551
- #
552
- # Returns a Language.
553
- def guess_r_language
554
- if lines.grep(/(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i).any?
555
- Language['Rebol']
556
- else
557
- Language['R']
558
- end
559
- end
560
-
561
- # Internal: Guess language of .t files.
562
- #
563
- # Returns a Language.
564
- def guess_t_language
565
- score = 0
566
- score += 1 if lines.grep(/^% /).any?
567
- score += data.gsub(/ := /).count
568
- score += data.gsub(/proc |procedure |fcn |function /).count
569
- score += data.gsub(/var \w+: \w+/).count
570
-
571
- # Tell-tale signs its gotta be Perl
572
- if lines.grep(/^(my )?(sub |\$|@|%)\w+/).any?
573
- score = 0
574
- end
575
-
576
- if score >= 3
577
- Language['Turing']
578
- else
579
- Language['Perl']
580
- end
581
- end
582
-
583
- # Internal: Guess language of .v files.
584
- #
585
- # Returns a Language
586
- def guess_v_language
587
- if lines.grep(/^(\/\*|\/\/|module|parameter|input|output|wire|reg|always|initial|begin|\`)/).any?
588
- Language['Verilog']
589
- else
590
- Language['Coq']
591
- end
592
- end
593
-
594
- # Internal: Guess language of .gsp files.
595
- #
596
- # Returns a Language.
597
- def guess_gsp_language
598
- if lines.grep(/<%|<%@|\$\{|<%|<g:|<meta name="layout"|<r:/).any?
599
- Language['Groovy Server Pages']
600
- else
601
- Language['Gosu']
602
- end
603
- end
604
-
605
- # Internal: Guess language from the first line.
606
- #
607
- # Look for leading "<?php" in Drupal files
608
- #
609
- # Returns a Language.
610
- def first_line_language
611
- # Only check files with drupal php extensions
612
- return unless drupal_extname?
613
-
614
- # Fail fast if blob isn't viewable?
615
- return unless viewable?
616
-
617
- if lines.first.to_s =~ /^<\?php/
618
- Language['PHP']
619
- end
620
- end
621
-
622
444
  # Internal: Extract the script name from the shebang line
623
445
  #
624
446
  # Requires Blob#data
@@ -710,12 +532,5 @@ module Linguist
710
532
  ''
711
533
  end
712
534
  end
713
-
714
- Language.overridden_extensions.each do |extension|
715
- name = "guess_#{extension.sub(/^\./, '')}_language".to_sym
716
- unless instance_methods.map(&:to_sym).include?(name)
717
- raise NotImplementedError, "Language##{name} was not defined"
718
- end
719
- end
720
535
  end
721
536
  end
@@ -1,113 +1,88 @@
1
- require 'linguist/language'
2
1
  require 'linguist/tokenizer'
3
2
 
4
3
  module Linguist
5
4
  # Language bayesian classifier.
6
5
  class Classifier
7
- # Internal: Path to persisted classifier db.
8
- PATH = File.expand_path('../classifier.yml', __FILE__)
9
-
10
- # Public: Check if persisted db exists on disk.
11
- #
12
- # Returns Boolean.
13
- def self.exist?
14
- File.exist?(PATH)
15
- end
16
-
17
- # Public: Get persisted Classifier instance.
18
- #
19
- # Returns Classifier.
20
- def self.instance
21
- @instance ||= YAML.load_file(PATH)
22
- end
23
-
24
- # Public: Initialize a Classifier.
25
- def initialize
26
- @tokens_total = 0
27
- @languages_total = 0
28
- @tokens = Hash.new { |h, k| h[k] = Hash.new(0) }
29
- @language_tokens = Hash.new(0)
30
- @languages = Hash.new(0)
31
- end
32
-
33
- # Public: Compare Classifier objects.
34
- #
35
- # other - Classifier object to compare to.
36
- #
37
- # Returns Boolean.
38
- def eql?(other)
39
- # Lazy fast check counts only
40
- other.is_a?(self.class) &&
41
- @tokens_total == other.instance_variable_get(:@tokens_total) &&
42
- @languages_total == other.instance_variable_get(:@languages_total)
43
- end
44
- alias_method :==, :eql?
45
-
46
6
  # Public: Train classifier that data is a certain language.
47
7
  #
48
- # language - Language of data
8
+ # db - Hash classifier database object
9
+ # language - String language of data
49
10
  # data - String contents of file
50
11
  #
51
12
  # Examples
52
13
  #
53
- # train(Language['Ruby'], "def hello; end")
14
+ # Classifier.train!(db, 'Ruby', "def hello; end")
54
15
  #
55
16
  # Returns nothing.
56
- def train(language, data)
57
- language = language.name
58
- tokens = Tokenizer.new(data).tokens
17
+ def self.train!(db, language, data)
18
+ tokens = Tokenizer.tokenize(data)
19
+
20
+ db['tokens_total'] ||= 0
21
+ db['languages_total'] ||= 0
22
+ db['tokens'] ||= {}
23
+ db['language_tokens'] ||= {}
24
+ db['languages'] ||= {}
59
25
 
60
26
  tokens.each do |token|
61
- @tokens[language][token] += 1
62
- @language_tokens[language] += 1
63
- @tokens_total += 1
27
+ db['tokens'][language] ||= {}
28
+ db['tokens'][language][token] ||= 0
29
+ db['tokens'][language][token] += 1
30
+ db['language_tokens'][language] ||= 0
31
+ db['language_tokens'][language] += 1
32
+ db['tokens_total'] += 1
64
33
  end
65
- @languages[language] += 1
66
- @languages_total += 1
34
+ db['languages'][language] ||= 0
35
+ db['languages'][language] += 1
36
+ db['languages_total'] += 1
67
37
 
68
38
  nil
69
39
  end
70
40
 
71
- # Public: Verify internal counts are consistent.
41
+ # Public: Guess language of data.
42
+ #
43
+ # db - Hash of classifier tokens database.
44
+ # data - Array of tokens or String data to analyze.
45
+ # languages - Array of language name Strings to restrict to.
46
+ #
47
+ # Examples
72
48
  #
73
- # Returns Boolean.
74
- def verify
75
- @languages.inject(0) { |n, (l, c)| n += c } == @languages_total &&
76
- @language_tokens.inject(0) { |n, (l, c)| n += c } == @tokens_total &&
77
- @tokens.inject(0) { |n, (l, ts)| n += ts.inject(0) { |m, (t, c)| m += c } } == @tokens_total
49
+ # Classifier.classify(db, "def hello; end")
50
+ # # => [ ['Ruby', 0.90], ['Python', 0.2], ... ]
51
+ #
52
+ # Returns sorted Array of result pairs. Each pair contains the
53
+ # String language name and a Float score.
54
+ def self.classify(db, tokens, languages = nil)
55
+ languages ||= db['languages'].keys
56
+ new(db).classify(tokens, languages)
78
57
  end
79
58
 
80
- # Public: Prune infrequent tokens.
81
- #
82
- # Returns receiver Classifier instance.
83
- def gc
84
- self
59
+ # Internal: Initialize a Classifier.
60
+ def initialize(db = {})
61
+ @tokens_total = db['tokens_total']
62
+ @languages_total = db['languages_total']
63
+ @tokens = db['tokens']
64
+ @language_tokens = db['language_tokens']
65
+ @languages = db['languages']
85
66
  end
86
67
 
87
- # Public: Guess language of data.
68
+ # Internal: Guess language of data
88
69
  #
89
70
  # data - Array of tokens or String data to analyze.
90
- # languages - Array of Languages to restrict to.
91
- #
92
- # Examples
93
- #
94
- # classify("def hello; end")
95
- # # => [ [Language['Ruby'], 0.90], [Language['Python'], 0.2], ... ]
71
+ # languages - Array of language name Strings to restrict to.
96
72
  #
97
73
  # Returns sorted Array of result pairs. Each pair contains the
98
- # Language and a Float score.
99
- def classify(tokens, languages = @languages.keys)
74
+ # String language name and a Float score.
75
+ def classify(tokens, languages)
100
76
  return [] if tokens.nil?
101
- tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String)
77
+ tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
102
78
 
103
79
  scores = {}
104
80
  languages.each do |language|
105
- language_name = language.is_a?(Language) ? language.name : language
106
- scores[language_name] = tokens_probability(tokens, language_name) +
107
- language_probability(language_name)
81
+ scores[language] = tokens_probability(tokens, language) +
82
+ language_probability(language)
108
83
  end
109
84
 
110
- scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] }
85
+ scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
111
86
  end
112
87
 
113
88
  # Internal: Probability of a set of tokens in a language occurring - P(D | C)
@@ -144,41 +119,5 @@ module Linguist
144
119
  def language_probability(language)
145
120
  Math.log(@languages[language].to_f / @languages_total.to_f)
146
121
  end
147
-
148
- # Public: Serialize classifier to YAML.
149
- #
150
- # opts - Hash of YAML options.
151
- #
152
- # Returns nothing.
153
- def to_yaml(io)
154
- data = "--- !ruby/object:Linguist::Classifier\n"
155
-
156
- data << "languages_total: #{@languages_total}\n"
157
- data << "tokens_total: #{@tokens_total}\n"
158
-
159
- data << "languages:\n"
160
- @languages.sort.each do |language, count|
161
- data << " #{{language => count}.to_yaml.lines.to_a[1]}"
162
- end
163
-
164
- data << "language_tokens:\n"
165
- @language_tokens.sort.each do |language, count|
166
- data << " #{{language => count}.to_yaml.lines.to_a[1]}"
167
- end
168
-
169
- data << "tokens:\n"
170
- @tokens.sort.each do |language, tokens|
171
- data << " #{{language => true}.to_yaml.lines.to_a[1].sub(/ true/, "")}"
172
- tokens.sort.each do |token, count|
173
- data << " #{{token => count}.to_yaml.lines.to_a[1]}"
174
- end
175
- end
176
-
177
- io.write data
178
- nil
179
- end
180
122
  end
181
-
182
- # Eager load instance
183
- Classifier.instance if Classifier.exist?
184
123
  end