github-linguist 2.0.1 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/linguist +1 -1
- data/lib/linguist.rb +1 -1
- data/lib/linguist/blob_helper.rb +9 -194
- data/lib/linguist/classifier.rb +50 -111
- data/lib/linguist/language.rb +31 -16
- data/lib/linguist/languages.yml +110 -121
- data/lib/linguist/md5.rb +38 -0
- data/lib/linguist/repository.rb +1 -1
- data/lib/linguist/samples.json +20125 -0
- data/lib/linguist/samples.rb +94 -0
- data/lib/linguist/tokenizer.rb +34 -44
- metadata +21 -5
- data/lib/linguist/classifier.yml +0 -19013
- data/lib/linguist/pathname.rb +0 -92
- data/lib/linguist/sample.rb +0 -74
data/bin/linguist
CHANGED
data/lib/linguist.rb
CHANGED
data/lib/linguist/blob_helper.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'linguist/classifier'
|
2
2
|
require 'linguist/language'
|
3
3
|
require 'linguist/mime'
|
4
|
-
require 'linguist/
|
4
|
+
require 'linguist/samples'
|
5
5
|
|
6
6
|
require 'charlock_holmes'
|
7
7
|
require 'escape_utils'
|
@@ -12,13 +12,6 @@ module Linguist
|
|
12
12
|
# BlobHelper is a mixin for Blobish classes that respond to "name",
|
13
13
|
# "data" and "size" such as Grit::Blob.
|
14
14
|
module BlobHelper
|
15
|
-
# Internal: Get a Pathname wrapper for Blob#name
|
16
|
-
#
|
17
|
-
# Returns a Pathname.
|
18
|
-
def pathname
|
19
|
-
Pathname.new(name || "")
|
20
|
-
end
|
21
|
-
|
22
15
|
# Public: Get the extname of the path
|
23
16
|
#
|
24
17
|
# Examples
|
@@ -28,7 +21,7 @@ module Linguist
|
|
28
21
|
#
|
29
22
|
# Returns a String
|
30
23
|
def extname
|
31
|
-
|
24
|
+
File.extname(name)
|
32
25
|
end
|
33
26
|
|
34
27
|
# Public: Get the actual blob mime type
|
@@ -40,7 +33,7 @@ module Linguist
|
|
40
33
|
#
|
41
34
|
# Returns a mime type String.
|
42
35
|
def mime_type
|
43
|
-
@mime_type ||=
|
36
|
+
@mime_type ||= Mime.mime_for(extname)
|
44
37
|
end
|
45
38
|
|
46
39
|
# Public: Get the Content-Type header value
|
@@ -72,7 +65,7 @@ module Linguist
|
|
72
65
|
elsif name.nil?
|
73
66
|
"attachment"
|
74
67
|
else
|
75
|
-
"attachment; filename=#{EscapeUtils.escape_url(
|
68
|
+
"attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
|
76
69
|
end
|
77
70
|
end
|
78
71
|
|
@@ -95,7 +88,7 @@ module Linguist
|
|
95
88
|
#
|
96
89
|
# Return true or false
|
97
90
|
def binary_mime_type?
|
98
|
-
if mime_type = Mime.lookup_mime_type_for(
|
91
|
+
if mime_type = Mime.lookup_mime_type_for(extname)
|
99
92
|
mime_type.binary?
|
100
93
|
end
|
101
94
|
end
|
@@ -136,13 +129,6 @@ module Linguist
|
|
136
129
|
['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
|
137
130
|
end
|
138
131
|
|
139
|
-
# Public: Is the blob a possible drupal php file?
|
140
|
-
#
|
141
|
-
# Return true or false
|
142
|
-
def drupal_extname?
|
143
|
-
['.module', '.install', '.test', '.inc'].include?(extname)
|
144
|
-
end
|
145
|
-
|
146
132
|
# Public: Is the blob likely to have a shebang?
|
147
133
|
#
|
148
134
|
# Return true or false
|
@@ -428,10 +414,7 @@ module Linguist
|
|
428
414
|
disambiguate_extension_language ||
|
429
415
|
|
430
416
|
# See if there is a Language for the extension
|
431
|
-
|
432
|
-
|
433
|
-
# Look for idioms in first line
|
434
|
-
first_line_language ||
|
417
|
+
Language.find_by_filename(name) ||
|
435
418
|
|
436
419
|
# Try to detect Language from shebang line
|
437
420
|
shebang_language
|
@@ -446,179 +429,18 @@ module Linguist
|
|
446
429
|
|
447
430
|
# Internal: Disambiguates between multiple language extensions.
|
448
431
|
#
|
449
|
-
# Delegates to "guess_EXTENSION_language".
|
450
|
-
#
|
451
|
-
# Please add additional test coverage to
|
452
|
-
# `test/test_blob.rb#test_language` if you add another method.
|
453
|
-
#
|
454
432
|
# Returns a Language or nil.
|
455
433
|
def disambiguate_extension_language
|
456
434
|
if Language.ambiguous?(extname)
|
457
|
-
|
458
|
-
# send(name) if respond_to?(name)
|
459
|
-
|
460
|
-
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }
|
435
|
+
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
|
461
436
|
if possible_languages.any?
|
462
|
-
if result = Classifier.
|
463
|
-
result[0]
|
437
|
+
if result = Classifier.classify(Samples::DATA, data, possible_languages).first
|
438
|
+
Language[result[0]]
|
464
439
|
end
|
465
440
|
end
|
466
441
|
end
|
467
442
|
end
|
468
443
|
|
469
|
-
# Internal: Guess language of .cls files
|
470
|
-
#
|
471
|
-
# Returns a Language.
|
472
|
-
def guess_cls_language
|
473
|
-
if lines.grep(/^(%|\\)/).any?
|
474
|
-
Language['TeX']
|
475
|
-
elsif lines.grep(/^\s*(CLASS|METHOD|INTERFACE).*:\s*/i).any? || lines.grep(/^\s*(USING|DEFINE)/i).any?
|
476
|
-
Language['OpenEdge ABL']
|
477
|
-
elsif lines.grep(/\{$/).any? || lines.grep(/\}$/).any?
|
478
|
-
Language['Apex']
|
479
|
-
elsif lines.grep(/^(\'\*|Attribute|Option|Sub|Private|Protected|Public|Friend)/i).any?
|
480
|
-
Language['Visual Basic']
|
481
|
-
else
|
482
|
-
# The most common language should be the fallback
|
483
|
-
Language['TeX']
|
484
|
-
end
|
485
|
-
end
|
486
|
-
|
487
|
-
# Internal: Guess language of header files (.h).
|
488
|
-
#
|
489
|
-
# Returns a Language.
|
490
|
-
def guess_h_language
|
491
|
-
if lines.grep(/^@(interface|property|private|public|end)/).any?
|
492
|
-
Language['Objective-C']
|
493
|
-
elsif lines.grep(/^class |^\s+(public|protected|private):/).any?
|
494
|
-
Language['C++']
|
495
|
-
else
|
496
|
-
Language['C']
|
497
|
-
end
|
498
|
-
end
|
499
|
-
|
500
|
-
# Internal: Guess language of .m files.
|
501
|
-
#
|
502
|
-
# Objective-C heuristics:
|
503
|
-
# * Keywords ("#import", "#include", "#ifdef", #define, "@end") or "//" and opening "\*" comments
|
504
|
-
#
|
505
|
-
# Matlab heuristics:
|
506
|
-
# * Leading "function " of "classdef " keyword
|
507
|
-
# * "%" comments
|
508
|
-
#
|
509
|
-
# Note: All "#" keywords, e.g., "#import", are guaranteed to be Objective-C. Because the ampersand
|
510
|
-
# is used to created function handles and anonymous functions in Matlab, most "@" keywords are not
|
511
|
-
# safe heuristics. However, "end" is a reserved term in Matlab and can't be used to create a valid
|
512
|
-
# function handle. Because @end is required to close any @implementation, @property, @interface,
|
513
|
-
# @synthesize, etc. directive in Objective-C, only @end needs to be checked for.
|
514
|
-
#
|
515
|
-
# Returns a Language.
|
516
|
-
def guess_m_language
|
517
|
-
# Objective-C keywords or comments
|
518
|
-
if lines.grep(/^#(import|include|ifdef|define)|@end/).any? || lines.grep(/^\s*\/\//).any? || lines.grep(/^\s*\/\*/).any?
|
519
|
-
Language['Objective-C']
|
520
|
-
|
521
|
-
# Matlab file function or class or comments
|
522
|
-
elsif lines.any? && lines.first.match(/^\s*(function |classdef )/) || lines.grep(/^\s*%/).any?
|
523
|
-
Language['Matlab']
|
524
|
-
|
525
|
-
# Fallback to Objective-C, don't want any Matlab false positives
|
526
|
-
else
|
527
|
-
Language['Objective-C']
|
528
|
-
end
|
529
|
-
end
|
530
|
-
|
531
|
-
# Internal: Guess language of .pl files
|
532
|
-
#
|
533
|
-
# The rules for disambiguation are:
|
534
|
-
#
|
535
|
-
# 1. Many perl files begin with a shebang
|
536
|
-
# 2. Most Prolog source files have a rule somewhere (marked by the :- operator)
|
537
|
-
# 3. Default to Perl, because it is more popular
|
538
|
-
#
|
539
|
-
# Returns a Language.
|
540
|
-
def guess_pl_language
|
541
|
-
if shebang_script == 'perl'
|
542
|
-
Language['Perl']
|
543
|
-
elsif lines.grep(/:-/).any?
|
544
|
-
Language['Prolog']
|
545
|
-
else
|
546
|
-
Language['Perl']
|
547
|
-
end
|
548
|
-
end
|
549
|
-
|
550
|
-
# Internal: Guess language of .r files.
|
551
|
-
#
|
552
|
-
# Returns a Language.
|
553
|
-
def guess_r_language
|
554
|
-
if lines.grep(/(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i).any?
|
555
|
-
Language['Rebol']
|
556
|
-
else
|
557
|
-
Language['R']
|
558
|
-
end
|
559
|
-
end
|
560
|
-
|
561
|
-
# Internal: Guess language of .t files.
|
562
|
-
#
|
563
|
-
# Returns a Language.
|
564
|
-
def guess_t_language
|
565
|
-
score = 0
|
566
|
-
score += 1 if lines.grep(/^% /).any?
|
567
|
-
score += data.gsub(/ := /).count
|
568
|
-
score += data.gsub(/proc |procedure |fcn |function /).count
|
569
|
-
score += data.gsub(/var \w+: \w+/).count
|
570
|
-
|
571
|
-
# Tell-tale signs its gotta be Perl
|
572
|
-
if lines.grep(/^(my )?(sub |\$|@|%)\w+/).any?
|
573
|
-
score = 0
|
574
|
-
end
|
575
|
-
|
576
|
-
if score >= 3
|
577
|
-
Language['Turing']
|
578
|
-
else
|
579
|
-
Language['Perl']
|
580
|
-
end
|
581
|
-
end
|
582
|
-
|
583
|
-
# Internal: Guess language of .v files.
|
584
|
-
#
|
585
|
-
# Returns a Language
|
586
|
-
def guess_v_language
|
587
|
-
if lines.grep(/^(\/\*|\/\/|module|parameter|input|output|wire|reg|always|initial|begin|\`)/).any?
|
588
|
-
Language['Verilog']
|
589
|
-
else
|
590
|
-
Language['Coq']
|
591
|
-
end
|
592
|
-
end
|
593
|
-
|
594
|
-
# Internal: Guess language of .gsp files.
|
595
|
-
#
|
596
|
-
# Returns a Language.
|
597
|
-
def guess_gsp_language
|
598
|
-
if lines.grep(/<%|<%@|\$\{|<%|<g:|<meta name="layout"|<r:/).any?
|
599
|
-
Language['Groovy Server Pages']
|
600
|
-
else
|
601
|
-
Language['Gosu']
|
602
|
-
end
|
603
|
-
end
|
604
|
-
|
605
|
-
# Internal: Guess language from the first line.
|
606
|
-
#
|
607
|
-
# Look for leading "<?php" in Drupal files
|
608
|
-
#
|
609
|
-
# Returns a Language.
|
610
|
-
def first_line_language
|
611
|
-
# Only check files with drupal php extensions
|
612
|
-
return unless drupal_extname?
|
613
|
-
|
614
|
-
# Fail fast if blob isn't viewable?
|
615
|
-
return unless viewable?
|
616
|
-
|
617
|
-
if lines.first.to_s =~ /^<\?php/
|
618
|
-
Language['PHP']
|
619
|
-
end
|
620
|
-
end
|
621
|
-
|
622
444
|
# Internal: Extract the script name from the shebang line
|
623
445
|
#
|
624
446
|
# Requires Blob#data
|
@@ -710,12 +532,5 @@ module Linguist
|
|
710
532
|
''
|
711
533
|
end
|
712
534
|
end
|
713
|
-
|
714
|
-
Language.overridden_extensions.each do |extension|
|
715
|
-
name = "guess_#{extension.sub(/^\./, '')}_language".to_sym
|
716
|
-
unless instance_methods.map(&:to_sym).include?(name)
|
717
|
-
raise NotImplementedError, "Language##{name} was not defined"
|
718
|
-
end
|
719
|
-
end
|
720
535
|
end
|
721
536
|
end
|
data/lib/linguist/classifier.rb
CHANGED
@@ -1,113 +1,88 @@
|
|
1
|
-
require 'linguist/language'
|
2
1
|
require 'linguist/tokenizer'
|
3
2
|
|
4
3
|
module Linguist
|
5
4
|
# Language bayesian classifier.
|
6
5
|
class Classifier
|
7
|
-
# Internal: Path to persisted classifier db.
|
8
|
-
PATH = File.expand_path('../classifier.yml', __FILE__)
|
9
|
-
|
10
|
-
# Public: Check if persisted db exists on disk.
|
11
|
-
#
|
12
|
-
# Returns Boolean.
|
13
|
-
def self.exist?
|
14
|
-
File.exist?(PATH)
|
15
|
-
end
|
16
|
-
|
17
|
-
# Public: Get persisted Classifier instance.
|
18
|
-
#
|
19
|
-
# Returns Classifier.
|
20
|
-
def self.instance
|
21
|
-
@instance ||= YAML.load_file(PATH)
|
22
|
-
end
|
23
|
-
|
24
|
-
# Public: Initialize a Classifier.
|
25
|
-
def initialize
|
26
|
-
@tokens_total = 0
|
27
|
-
@languages_total = 0
|
28
|
-
@tokens = Hash.new { |h, k| h[k] = Hash.new(0) }
|
29
|
-
@language_tokens = Hash.new(0)
|
30
|
-
@languages = Hash.new(0)
|
31
|
-
end
|
32
|
-
|
33
|
-
# Public: Compare Classifier objects.
|
34
|
-
#
|
35
|
-
# other - Classifier object to compare to.
|
36
|
-
#
|
37
|
-
# Returns Boolean.
|
38
|
-
def eql?(other)
|
39
|
-
# Lazy fast check counts only
|
40
|
-
other.is_a?(self.class) &&
|
41
|
-
@tokens_total == other.instance_variable_get(:@tokens_total) &&
|
42
|
-
@languages_total == other.instance_variable_get(:@languages_total)
|
43
|
-
end
|
44
|
-
alias_method :==, :eql?
|
45
|
-
|
46
6
|
# Public: Train classifier that data is a certain language.
|
47
7
|
#
|
48
|
-
#
|
8
|
+
# db - Hash classifier database object
|
9
|
+
# language - String language of data
|
49
10
|
# data - String contents of file
|
50
11
|
#
|
51
12
|
# Examples
|
52
13
|
#
|
53
|
-
# train(
|
14
|
+
# Classifier.train!(db, 'Ruby', "def hello; end")
|
54
15
|
#
|
55
16
|
# Returns nothing.
|
56
|
-
def train(language, data)
|
57
|
-
|
58
|
-
|
17
|
+
def self.train!(db, language, data)
|
18
|
+
tokens = Tokenizer.tokenize(data)
|
19
|
+
|
20
|
+
db['tokens_total'] ||= 0
|
21
|
+
db['languages_total'] ||= 0
|
22
|
+
db['tokens'] ||= {}
|
23
|
+
db['language_tokens'] ||= {}
|
24
|
+
db['languages'] ||= {}
|
59
25
|
|
60
26
|
tokens.each do |token|
|
61
|
-
|
62
|
-
|
63
|
-
|
27
|
+
db['tokens'][language] ||= {}
|
28
|
+
db['tokens'][language][token] ||= 0
|
29
|
+
db['tokens'][language][token] += 1
|
30
|
+
db['language_tokens'][language] ||= 0
|
31
|
+
db['language_tokens'][language] += 1
|
32
|
+
db['tokens_total'] += 1
|
64
33
|
end
|
65
|
-
|
66
|
-
|
34
|
+
db['languages'][language] ||= 0
|
35
|
+
db['languages'][language] += 1
|
36
|
+
db['languages_total'] += 1
|
67
37
|
|
68
38
|
nil
|
69
39
|
end
|
70
40
|
|
71
|
-
# Public:
|
41
|
+
# Public: Guess language of data.
|
42
|
+
#
|
43
|
+
# db - Hash of classifier tokens database.
|
44
|
+
# data - Array of tokens or String data to analyze.
|
45
|
+
# languages - Array of language name Strings to restrict to.
|
46
|
+
#
|
47
|
+
# Examples
|
72
48
|
#
|
73
|
-
#
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
49
|
+
# Classifier.classify(db, "def hello; end")
|
50
|
+
# # => [ ['Ruby', 0.90], ['Python', 0.2], ... ]
|
51
|
+
#
|
52
|
+
# Returns sorted Array of result pairs. Each pair contains the
|
53
|
+
# String language name and a Float score.
|
54
|
+
def self.classify(db, tokens, languages = nil)
|
55
|
+
languages ||= db['languages'].keys
|
56
|
+
new(db).classify(tokens, languages)
|
78
57
|
end
|
79
58
|
|
80
|
-
#
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
59
|
+
# Internal: Initialize a Classifier.
|
60
|
+
def initialize(db = {})
|
61
|
+
@tokens_total = db['tokens_total']
|
62
|
+
@languages_total = db['languages_total']
|
63
|
+
@tokens = db['tokens']
|
64
|
+
@language_tokens = db['language_tokens']
|
65
|
+
@languages = db['languages']
|
85
66
|
end
|
86
67
|
|
87
|
-
#
|
68
|
+
# Internal: Guess language of data
|
88
69
|
#
|
89
70
|
# data - Array of tokens or String data to analyze.
|
90
|
-
# languages - Array of
|
91
|
-
#
|
92
|
-
# Examples
|
93
|
-
#
|
94
|
-
# classify("def hello; end")
|
95
|
-
# # => [ [Language['Ruby'], 0.90], [Language['Python'], 0.2], ... ]
|
71
|
+
# languages - Array of language name Strings to restrict to.
|
96
72
|
#
|
97
73
|
# Returns sorted Array of result pairs. Each pair contains the
|
98
|
-
#
|
99
|
-
def classify(tokens, languages
|
74
|
+
# String language name and a Float score.
|
75
|
+
def classify(tokens, languages)
|
100
76
|
return [] if tokens.nil?
|
101
|
-
tokens = Tokenizer.
|
77
|
+
tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
|
102
78
|
|
103
79
|
scores = {}
|
104
80
|
languages.each do |language|
|
105
|
-
|
106
|
-
|
107
|
-
language_probability(language_name)
|
81
|
+
scores[language] = tokens_probability(tokens, language) +
|
82
|
+
language_probability(language)
|
108
83
|
end
|
109
84
|
|
110
|
-
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [
|
85
|
+
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
|
111
86
|
end
|
112
87
|
|
113
88
|
# Internal: Probability of set of tokens in a language occurring - P(D | C)
|
@@ -144,41 +119,5 @@ module Linguist
|
|
144
119
|
def language_probability(language)
|
145
120
|
Math.log(@languages[language].to_f / @languages_total.to_f)
|
146
121
|
end
|
147
|
-
|
148
|
-
# Public: Serialize classifier to YAML.
|
149
|
-
#
|
150
|
-
# opts - Hash of YAML options.
|
151
|
-
#
|
152
|
-
# Returns nothing.
|
153
|
-
def to_yaml(io)
|
154
|
-
data = "--- !ruby/object:Linguist::Classifier\n"
|
155
|
-
|
156
|
-
data << "languages_total: #{@languages_total}\n"
|
157
|
-
data << "tokens_total: #{@tokens_total}\n"
|
158
|
-
|
159
|
-
data << "languages:\n"
|
160
|
-
@languages.sort.each do |language, count|
|
161
|
-
data << " #{{language => count}.to_yaml.lines.to_a[1]}"
|
162
|
-
end
|
163
|
-
|
164
|
-
data << "language_tokens:\n"
|
165
|
-
@language_tokens.sort.each do |language, count|
|
166
|
-
data << " #{{language => count}.to_yaml.lines.to_a[1]}"
|
167
|
-
end
|
168
|
-
|
169
|
-
data << "tokens:\n"
|
170
|
-
@tokens.sort.each do |language, tokens|
|
171
|
-
data << " #{{language => true}.to_yaml.lines.to_a[1].sub(/ true/, "")}"
|
172
|
-
tokens.sort.each do |token, count|
|
173
|
-
data << " #{{token => count}.to_yaml.lines.to_a[1]}"
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
io.write data
|
178
|
-
nil
|
179
|
-
end
|
180
122
|
end
|
181
|
-
|
182
|
-
# Eager load instance
|
183
|
-
Classifier.instance if Classifier.exist?
|
184
123
|
end
|