github-linguist 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/linguist CHANGED
@@ -28,7 +28,7 @@ elsif File.file?(path)
28
28
  puts " language: #{blob.language}"
29
29
 
30
30
  if blob.large?
31
- puts " blob is to large to be shown"
31
+ puts " blob is too large to be shown"
32
32
  end
33
33
 
34
34
  if blob.generated?
data/lib/linguist.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'linguist/blob_helper'
2
2
  require 'linguist/language'
3
3
  require 'linguist/mime'
4
- require 'linguist/pathname'
5
4
  require 'linguist/repository'
5
+ require 'linguist/samples'
@@ -1,7 +1,7 @@
1
1
  require 'linguist/classifier'
2
2
  require 'linguist/language'
3
3
  require 'linguist/mime'
4
- require 'linguist/pathname'
4
+ require 'linguist/samples'
5
5
 
6
6
  require 'charlock_holmes'
7
7
  require 'escape_utils'
@@ -12,13 +12,6 @@ module Linguist
12
12
  # BlobHelper is a mixin for Blobish classes that respond to "name",
13
13
  # "data" and "size" such as Grit::Blob.
14
14
  module BlobHelper
15
- # Internal: Get a Pathname wrapper for Blob#name
16
- #
17
- # Returns a Pathname.
18
- def pathname
19
- Pathname.new(name || "")
20
- end
21
-
22
15
  # Public: Get the extname of the path
23
16
  #
24
17
  # Examples
@@ -28,7 +21,7 @@ module Linguist
28
21
  #
29
22
  # Returns a String
30
23
  def extname
31
- pathname.extname
24
+ File.extname(name)
32
25
  end
33
26
 
34
27
  # Public: Get the actual blob mime type
@@ -40,7 +33,7 @@ module Linguist
40
33
  #
41
34
  # Returns a mime type String.
42
35
  def mime_type
43
- @mime_type ||= pathname.mime_type
36
+ @mime_type ||= Mime.mime_for(extname)
44
37
  end
45
38
 
46
39
  # Public: Get the Content-Type header value
@@ -72,7 +65,7 @@ module Linguist
72
65
  elsif name.nil?
73
66
  "attachment"
74
67
  else
75
- "attachment; filename=#{EscapeUtils.escape_url(pathname.basename)}"
68
+ "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
76
69
  end
77
70
  end
78
71
 
@@ -95,7 +88,7 @@ module Linguist
95
88
  #
96
89
  # Return true or false
97
90
  def binary_mime_type?
98
- if mime_type = Mime.lookup_mime_type_for(pathname.extname)
91
+ if mime_type = Mime.lookup_mime_type_for(extname)
99
92
  mime_type.binary?
100
93
  end
101
94
  end
@@ -136,13 +129,6 @@ module Linguist
136
129
  ['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
137
130
  end
138
131
 
139
- # Public: Is the blob a possible drupal php file?
140
- #
141
- # Return true or false
142
- def drupal_extname?
143
- ['.module', '.install', '.test', '.inc'].include?(extname)
144
- end
145
-
146
132
  # Public: Is the blob likely to have a shebang?
147
133
  #
148
134
  # Return true or false
@@ -428,10 +414,7 @@ module Linguist
428
414
  disambiguate_extension_language ||
429
415
 
430
416
  # See if there is a Language for the extension
431
- pathname.language ||
432
-
433
- # Look for idioms in first line
434
- first_line_language ||
417
+ Language.find_by_filename(name) ||
435
418
 
436
419
  # Try to detect Language from shebang line
437
420
  shebang_language
@@ -446,179 +429,18 @@ module Linguist
446
429
 
447
430
  # Internal: Disambiguates between multiple language extensions.
448
431
  #
449
- # Delegates to "guess_EXTENSION_language".
450
- #
451
- # Please add additional test coverage to
452
- # `test/test_blob.rb#test_language` if you add another method.
453
- #
454
432
  # Returns a Language or nil.
455
433
  def disambiguate_extension_language
456
434
  if Language.ambiguous?(extname)
457
- # name = "guess_#{extname.sub(/^\./, '')}_language"
458
- # send(name) if respond_to?(name)
459
-
460
- possible_languages = Language.all.select { |l| l.extensions.include?(extname) }
435
+ possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
461
436
  if possible_languages.any?
462
- if result = Classifier.instance.classify(data, possible_languages).first
463
- result[0]
437
+ if result = Classifier.classify(Samples::DATA, data, possible_languages).first
438
+ Language[result[0]]
464
439
  end
465
440
  end
466
441
  end
467
442
  end
468
443
 
469
- # Internal: Guess language of .cls files
470
- #
471
- # Returns a Language.
472
- def guess_cls_language
473
- if lines.grep(/^(%|\\)/).any?
474
- Language['TeX']
475
- elsif lines.grep(/^\s*(CLASS|METHOD|INTERFACE).*:\s*/i).any? || lines.grep(/^\s*(USING|DEFINE)/i).any?
476
- Language['OpenEdge ABL']
477
- elsif lines.grep(/\{$/).any? || lines.grep(/\}$/).any?
478
- Language['Apex']
479
- elsif lines.grep(/^(\'\*|Attribute|Option|Sub|Private|Protected|Public|Friend)/i).any?
480
- Language['Visual Basic']
481
- else
482
- # The most common language should be the fallback
483
- Language['TeX']
484
- end
485
- end
486
-
487
- # Internal: Guess language of header files (.h).
488
- #
489
- # Returns a Language.
490
- def guess_h_language
491
- if lines.grep(/^@(interface|property|private|public|end)/).any?
492
- Language['Objective-C']
493
- elsif lines.grep(/^class |^\s+(public|protected|private):/).any?
494
- Language['C++']
495
- else
496
- Language['C']
497
- end
498
- end
499
-
500
- # Internal: Guess language of .m files.
501
- #
502
- # Objective-C heuristics:
503
- # * Keywords ("#import", "#include", "#ifdef", #define, "@end") or "//" and opening "\*" comments
504
- #
505
- # Matlab heuristics:
506
- # * Leading "function " of "classdef " keyword
507
- # * "%" comments
508
- #
509
- # Note: All "#" keywords, e.g., "#import", are guaranteed to be Objective-C. Because the ampersand
510
- # is used to created function handles and anonymous functions in Matlab, most "@" keywords are not
511
- # safe heuristics. However, "end" is a reserved term in Matlab and can't be used to create a valid
512
- # function handle. Because @end is required to close any @implementation, @property, @interface,
513
- # @synthesize, etc. directive in Objective-C, only @end needs to be checked for.
514
- #
515
- # Returns a Language.
516
- def guess_m_language
517
- # Objective-C keywords or comments
518
- if lines.grep(/^#(import|include|ifdef|define)|@end/).any? || lines.grep(/^\s*\/\//).any? || lines.grep(/^\s*\/\*/).any?
519
- Language['Objective-C']
520
-
521
- # Matlab file function or class or comments
522
- elsif lines.any? && lines.first.match(/^\s*(function |classdef )/) || lines.grep(/^\s*%/).any?
523
- Language['Matlab']
524
-
525
- # Fallback to Objective-C, don't want any Matlab false positives
526
- else
527
- Language['Objective-C']
528
- end
529
- end
530
-
531
- # Internal: Guess language of .pl files
532
- #
533
- # The rules for disambiguation are:
534
- #
535
- # 1. Many perl files begin with a shebang
536
- # 2. Most Prolog source files have a rule somewhere (marked by the :- operator)
537
- # 3. Default to Perl, because it is more popular
538
- #
539
- # Returns a Language.
540
- def guess_pl_language
541
- if shebang_script == 'perl'
542
- Language['Perl']
543
- elsif lines.grep(/:-/).any?
544
- Language['Prolog']
545
- else
546
- Language['Perl']
547
- end
548
- end
549
-
550
- # Internal: Guess language of .r files.
551
- #
552
- # Returns a Language.
553
- def guess_r_language
554
- if lines.grep(/(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i).any?
555
- Language['Rebol']
556
- else
557
- Language['R']
558
- end
559
- end
560
-
561
- # Internal: Guess language of .t files.
562
- #
563
- # Returns a Language.
564
- def guess_t_language
565
- score = 0
566
- score += 1 if lines.grep(/^% /).any?
567
- score += data.gsub(/ := /).count
568
- score += data.gsub(/proc |procedure |fcn |function /).count
569
- score += data.gsub(/var \w+: \w+/).count
570
-
571
- # Tell-tale signs its gotta be Perl
572
- if lines.grep(/^(my )?(sub |\$|@|%)\w+/).any?
573
- score = 0
574
- end
575
-
576
- if score >= 3
577
- Language['Turing']
578
- else
579
- Language['Perl']
580
- end
581
- end
582
-
583
- # Internal: Guess language of .v files.
584
- #
585
- # Returns a Language
586
- def guess_v_language
587
- if lines.grep(/^(\/\*|\/\/|module|parameter|input|output|wire|reg|always|initial|begin|\`)/).any?
588
- Language['Verilog']
589
- else
590
- Language['Coq']
591
- end
592
- end
593
-
594
- # Internal: Guess language of .gsp files.
595
- #
596
- # Returns a Language.
597
- def guess_gsp_language
598
- if lines.grep(/<%|<%@|\$\{|<%|<g:|<meta name="layout"|<r:/).any?
599
- Language['Groovy Server Pages']
600
- else
601
- Language['Gosu']
602
- end
603
- end
604
-
605
- # Internal: Guess language from the first line.
606
- #
607
- # Look for leading "<?php" in Drupal files
608
- #
609
- # Returns a Language.
610
- def first_line_language
611
- # Only check files with drupal php extensions
612
- return unless drupal_extname?
613
-
614
- # Fail fast if blob isn't viewable?
615
- return unless viewable?
616
-
617
- if lines.first.to_s =~ /^<\?php/
618
- Language['PHP']
619
- end
620
- end
621
-
622
444
  # Internal: Extract the script name from the shebang line
623
445
  #
624
446
  # Requires Blob#data
@@ -710,12 +532,5 @@ module Linguist
710
532
  ''
711
533
  end
712
534
  end
713
-
714
- Language.overridden_extensions.each do |extension|
715
- name = "guess_#{extension.sub(/^\./, '')}_language".to_sym
716
- unless instance_methods.map(&:to_sym).include?(name)
717
- raise NotImplementedError, "Language##{name} was not defined"
718
- end
719
- end
720
535
  end
721
536
  end
@@ -1,113 +1,88 @@
1
- require 'linguist/language'
2
1
  require 'linguist/tokenizer'
3
2
 
4
3
  module Linguist
5
4
  # Language bayesian classifier.
6
5
  class Classifier
7
- # Internal: Path to persisted classifier db.
8
- PATH = File.expand_path('../classifier.yml', __FILE__)
9
-
10
- # Public: Check if persisted db exists on disk.
11
- #
12
- # Returns Boolean.
13
- def self.exist?
14
- File.exist?(PATH)
15
- end
16
-
17
- # Public: Get persisted Classifier instance.
18
- #
19
- # Returns Classifier.
20
- def self.instance
21
- @instance ||= YAML.load_file(PATH)
22
- end
23
-
24
- # Public: Initialize a Classifier.
25
- def initialize
26
- @tokens_total = 0
27
- @languages_total = 0
28
- @tokens = Hash.new { |h, k| h[k] = Hash.new(0) }
29
- @language_tokens = Hash.new(0)
30
- @languages = Hash.new(0)
31
- end
32
-
33
- # Public: Compare Classifier objects.
34
- #
35
- # other - Classifier object to compare to.
36
- #
37
- # Returns Boolean.
38
- def eql?(other)
39
- # Lazy fast check counts only
40
- other.is_a?(self.class) &&
41
- @tokens_total == other.instance_variable_get(:@tokens_total) &&
42
- @languages_total == other.instance_variable_get(:@languages_total)
43
- end
44
- alias_method :==, :eql?
45
-
46
6
  # Public: Train classifier that data is a certain language.
47
7
  #
48
- # language - Language of data
8
+ # db - Hash classifier database object
9
+ # language - String language of data
49
10
  # data - String contents of file
50
11
  #
51
12
  # Examples
52
13
  #
53
- # train(Language['Ruby'], "def hello; end")
14
+ # Classifier.train!(db, 'Ruby', "def hello; end")
54
15
  #
55
16
  # Returns nothing.
56
- def train(language, data)
57
- language = language.name
58
- tokens = Tokenizer.new(data).tokens
17
+ def self.train!(db, language, data)
18
+ tokens = Tokenizer.tokenize(data)
19
+
20
+ db['tokens_total'] ||= 0
21
+ db['languages_total'] ||= 0
22
+ db['tokens'] ||= {}
23
+ db['language_tokens'] ||= {}
24
+ db['languages'] ||= {}
59
25
 
60
26
  tokens.each do |token|
61
- @tokens[language][token] += 1
62
- @language_tokens[language] += 1
63
- @tokens_total += 1
27
+ db['tokens'][language] ||= {}
28
+ db['tokens'][language][token] ||= 0
29
+ db['tokens'][language][token] += 1
30
+ db['language_tokens'][language] ||= 0
31
+ db['language_tokens'][language] += 1
32
+ db['tokens_total'] += 1
64
33
  end
65
- @languages[language] += 1
66
- @languages_total += 1
34
+ db['languages'][language] ||= 0
35
+ db['languages'][language] += 1
36
+ db['languages_total'] += 1
67
37
 
68
38
  nil
69
39
  end
70
40
 
71
- # Public: Verify internal counts are consistent.
41
+ # Public: Guess language of data.
42
+ #
43
+ # db - Hash of classifier tokens database.
44
+ # data - Array of tokens or String data to analyze.
45
+ # languages - Array of language name Strings to restrict to.
46
+ #
47
+ # Examples
72
48
  #
73
- # Returns Boolean.
74
- def verify
75
- @languages.inject(0) { |n, (l, c)| n += c } == @languages_total &&
76
- @language_tokens.inject(0) { |n, (l, c)| n += c } == @tokens_total &&
77
- @tokens.inject(0) { |n, (l, ts)| n += ts.inject(0) { |m, (t, c)| m += c } } == @tokens_total
49
+ # Classifier.classify(db, "def hello; end")
50
+ # # => [ ['Ruby', 0.90], ['Python', 0.2], ... ]
51
+ #
52
+ # Returns sorted Array of result pairs. Each pair contains the
53
+ # String language name and a Float score.
54
+ def self.classify(db, tokens, languages = nil)
55
+ languages ||= db['languages'].keys
56
+ new(db).classify(tokens, languages)
78
57
  end
79
58
 
80
- # Public: Prune infrequent tokens.
81
- #
82
- # Returns receiver Classifier instance.
83
- def gc
84
- self
59
+ # Internal: Initialize a Classifier.
60
+ def initialize(db = {})
61
+ @tokens_total = db['tokens_total']
62
+ @languages_total = db['languages_total']
63
+ @tokens = db['tokens']
64
+ @language_tokens = db['language_tokens']
65
+ @languages = db['languages']
85
66
  end
86
67
 
87
- # Public: Guess language of data.
68
+ # Internal: Guess language of data
88
69
  #
89
70
  # data - Array of tokens or String data to analyze.
90
- # languages - Array of Languages to restrict to.
91
- #
92
- # Examples
93
- #
94
- # classify("def hello; end")
95
- # # => [ [Language['Ruby'], 0.90], [Language['Python'], 0.2], ... ]
71
+ # languages - Array of language name Strings to restrict to.
96
72
  #
97
73
  # Returns sorted Array of result pairs. Each pair contains the
98
- # Language and a Float score.
99
- def classify(tokens, languages = @languages.keys)
74
+ # String language name and a Float score.
75
+ def classify(tokens, languages)
100
76
  return [] if tokens.nil?
101
- tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String)
77
+ tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
102
78
 
103
79
  scores = {}
104
80
  languages.each do |language|
105
- language_name = language.is_a?(Language) ? language.name : language
106
- scores[language_name] = tokens_probability(tokens, language_name) +
107
- language_probability(language_name)
81
+ scores[language] = tokens_probability(tokens, language) +
82
+ language_probability(language)
108
83
  end
109
84
 
110
- scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] }
85
+ scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
111
86
  end
112
87
 
113
88
  # Internal: Probability of a set of tokens occurring in a language - P(D | C)
@@ -144,41 +119,5 @@ module Linguist
144
119
  def language_probability(language)
145
120
  Math.log(@languages[language].to_f / @languages_total.to_f)
146
121
  end
147
-
148
- # Public: Serialize classifier to YAML.
149
- #
150
- # opts - Hash of YAML options.
151
- #
152
- # Returns nothing.
153
- def to_yaml(io)
154
- data = "--- !ruby/object:Linguist::Classifier\n"
155
-
156
- data << "languages_total: #{@languages_total}\n"
157
- data << "tokens_total: #{@tokens_total}\n"
158
-
159
- data << "languages:\n"
160
- @languages.sort.each do |language, count|
161
- data << " #{{language => count}.to_yaml.lines.to_a[1]}"
162
- end
163
-
164
- data << "language_tokens:\n"
165
- @language_tokens.sort.each do |language, count|
166
- data << " #{{language => count}.to_yaml.lines.to_a[1]}"
167
- end
168
-
169
- data << "tokens:\n"
170
- @tokens.sort.each do |language, tokens|
171
- data << " #{{language => true}.to_yaml.lines.to_a[1].sub(/ true/, "")}"
172
- tokens.sort.each do |token, count|
173
- data << " #{{token => count}.to_yaml.lines.to_a[1]}"
174
- end
175
- end
176
-
177
- io.write data
178
- nil
179
- end
180
122
  end
181
-
182
- # Eager load instance
183
- Classifier.instance if Classifier.exist?
184
123
  end