github-linguist 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/linguist +1 -1
- data/lib/linguist.rb +1 -1
- data/lib/linguist/blob_helper.rb +9 -194
- data/lib/linguist/classifier.rb +50 -111
- data/lib/linguist/language.rb +31 -16
- data/lib/linguist/languages.yml +110 -121
- data/lib/linguist/md5.rb +38 -0
- data/lib/linguist/repository.rb +1 -1
- data/lib/linguist/samples.json +20125 -0
- data/lib/linguist/samples.rb +94 -0
- data/lib/linguist/tokenizer.rb +34 -44
- metadata +21 -5
- data/lib/linguist/classifier.yml +0 -19013
- data/lib/linguist/pathname.rb +0 -92
- data/lib/linguist/sample.rb +0 -74
data/bin/linguist
CHANGED
data/lib/linguist.rb
CHANGED
data/lib/linguist/blob_helper.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'linguist/classifier'
|
2
2
|
require 'linguist/language'
|
3
3
|
require 'linguist/mime'
|
4
|
-
require 'linguist/pathname'
|
4
|
+
require 'linguist/samples'
|
5
5
|
|
6
6
|
require 'charlock_holmes'
|
7
7
|
require 'escape_utils'
|
@@ -12,13 +12,6 @@ module Linguist
|
|
12
12
|
# BlobHelper is a mixin for Blobish classes that respond to "name",
|
13
13
|
# "data" and "size" such as Grit::Blob.
|
14
14
|
module BlobHelper
|
15
|
-
# Internal: Get a Pathname wrapper for Blob#name
|
16
|
-
#
|
17
|
-
# Returns a Pathname.
|
18
|
-
def pathname
|
19
|
-
Pathname.new(name || "")
|
20
|
-
end
|
21
|
-
|
22
15
|
# Public: Get the extname of the path
|
23
16
|
#
|
24
17
|
# Examples
|
@@ -28,7 +21,7 @@ module Linguist
|
|
28
21
|
#
|
29
22
|
# Returns a String
|
30
23
|
def extname
|
31
|
-
|
24
|
+
File.extname(name)
|
32
25
|
end
|
33
26
|
|
34
27
|
# Public: Get the actual blob mime type
|
@@ -40,7 +33,7 @@ module Linguist
|
|
40
33
|
#
|
41
34
|
# Returns a mime type String.
|
42
35
|
def mime_type
|
43
|
-
@mime_type ||=
|
36
|
+
@mime_type ||= Mime.mime_for(extname)
|
44
37
|
end
|
45
38
|
|
46
39
|
# Public: Get the Content-Type header value
|
@@ -72,7 +65,7 @@ module Linguist
|
|
72
65
|
elsif name.nil?
|
73
66
|
"attachment"
|
74
67
|
else
|
75
|
-
"attachment; filename=#{EscapeUtils.escape_url(
|
68
|
+
"attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
|
76
69
|
end
|
77
70
|
end
|
78
71
|
|
@@ -95,7 +88,7 @@ module Linguist
|
|
95
88
|
#
|
96
89
|
# Return true or false
|
97
90
|
def binary_mime_type?
|
98
|
-
if mime_type = Mime.lookup_mime_type_for(
|
91
|
+
if mime_type = Mime.lookup_mime_type_for(extname)
|
99
92
|
mime_type.binary?
|
100
93
|
end
|
101
94
|
end
|
@@ -136,13 +129,6 @@ module Linguist
|
|
136
129
|
['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
|
137
130
|
end
|
138
131
|
|
139
|
-
# Public: Is the blob a possible drupal php file?
|
140
|
-
#
|
141
|
-
# Return true or false
|
142
|
-
def drupal_extname?
|
143
|
-
['.module', '.install', '.test', '.inc'].include?(extname)
|
144
|
-
end
|
145
|
-
|
146
132
|
# Public: Is the blob likely to have a shebang?
|
147
133
|
#
|
148
134
|
# Return true or false
|
@@ -428,10 +414,7 @@ module Linguist
|
|
428
414
|
disambiguate_extension_language ||
|
429
415
|
|
430
416
|
# See if there is a Language for the extension
|
431
|
-
|
432
|
-
|
433
|
-
# Look for idioms in first line
|
434
|
-
first_line_language ||
|
417
|
+
Language.find_by_filename(name) ||
|
435
418
|
|
436
419
|
# Try to detect Language from shebang line
|
437
420
|
shebang_language
|
@@ -446,179 +429,18 @@ module Linguist
|
|
446
429
|
|
447
430
|
# Internal: Disambiguates between multiple language extensions.
|
448
431
|
#
|
449
|
-
# Delegates to "guess_EXTENSION_language".
|
450
|
-
#
|
451
|
-
# Please add additional test coverage to
|
452
|
-
# `test/test_blob.rb#test_language` if you add another method.
|
453
|
-
#
|
454
432
|
# Returns a Language or nil.
|
455
433
|
def disambiguate_extension_language
|
456
434
|
if Language.ambiguous?(extname)
|
457
|
-
|
458
|
-
# send(name) if respond_to?(name)
|
459
|
-
|
460
|
-
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }
|
435
|
+
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
|
461
436
|
if possible_languages.any?
|
462
|
-
if result = Classifier.
|
463
|
-
result[0]
|
437
|
+
if result = Classifier.classify(Samples::DATA, data, possible_languages).first
|
438
|
+
Language[result[0]]
|
464
439
|
end
|
465
440
|
end
|
466
441
|
end
|
467
442
|
end
|
468
443
|
|
469
|
-
# Internal: Guess language of .cls files
|
470
|
-
#
|
471
|
-
# Returns a Language.
|
472
|
-
def guess_cls_language
|
473
|
-
if lines.grep(/^(%|\\)/).any?
|
474
|
-
Language['TeX']
|
475
|
-
elsif lines.grep(/^\s*(CLASS|METHOD|INTERFACE).*:\s*/i).any? || lines.grep(/^\s*(USING|DEFINE)/i).any?
|
476
|
-
Language['OpenEdge ABL']
|
477
|
-
elsif lines.grep(/\{$/).any? || lines.grep(/\}$/).any?
|
478
|
-
Language['Apex']
|
479
|
-
elsif lines.grep(/^(\'\*|Attribute|Option|Sub|Private|Protected|Public|Friend)/i).any?
|
480
|
-
Language['Visual Basic']
|
481
|
-
else
|
482
|
-
# The most common language should be the fallback
|
483
|
-
Language['TeX']
|
484
|
-
end
|
485
|
-
end
|
486
|
-
|
487
|
-
# Internal: Guess language of header files (.h).
|
488
|
-
#
|
489
|
-
# Returns a Language.
|
490
|
-
def guess_h_language
|
491
|
-
if lines.grep(/^@(interface|property|private|public|end)/).any?
|
492
|
-
Language['Objective-C']
|
493
|
-
elsif lines.grep(/^class |^\s+(public|protected|private):/).any?
|
494
|
-
Language['C++']
|
495
|
-
else
|
496
|
-
Language['C']
|
497
|
-
end
|
498
|
-
end
|
499
|
-
|
500
|
-
# Internal: Guess language of .m files.
|
501
|
-
#
|
502
|
-
# Objective-C heuristics:
|
503
|
-
# * Keywords ("#import", "#include", "#ifdef", #define, "@end") or "//" and opening "\*" comments
|
504
|
-
#
|
505
|
-
# Matlab heuristics:
|
506
|
-
# * Leading "function " or "classdef " keyword
|
507
|
-
# * "%" comments
|
508
|
-
#
|
509
|
-
# Note: All "#" keywords, e.g., "#import", are guaranteed to be Objective-C. Because the at sign
|
510
|
-
# is used to create function handles and anonymous functions in Matlab, most "@" keywords are not
|
511
|
-
# safe heuristics. However, "end" is a reserved term in Matlab and can't be used to create a valid
|
512
|
-
# function handle. Because @end is required to close any @implementation, @property, @interface,
|
513
|
-
# @synthesize, etc. directive in Objective-C, only @end needs to be checked for.
|
514
|
-
#
|
515
|
-
# Returns a Language.
|
516
|
-
def guess_m_language
|
517
|
-
# Objective-C keywords or comments
|
518
|
-
if lines.grep(/^#(import|include|ifdef|define)|@end/).any? || lines.grep(/^\s*\/\//).any? || lines.grep(/^\s*\/\*/).any?
|
519
|
-
Language['Objective-C']
|
520
|
-
|
521
|
-
# Matlab file function or class or comments
|
522
|
-
elsif lines.any? && lines.first.match(/^\s*(function |classdef )/) || lines.grep(/^\s*%/).any?
|
523
|
-
Language['Matlab']
|
524
|
-
|
525
|
-
# Fallback to Objective-C, don't want any Matlab false positives
|
526
|
-
else
|
527
|
-
Language['Objective-C']
|
528
|
-
end
|
529
|
-
end
|
530
|
-
|
531
|
-
# Internal: Guess language of .pl files
|
532
|
-
#
|
533
|
-
# The rules for disambiguation are:
|
534
|
-
#
|
535
|
-
# 1. Many perl files begin with a shebang
|
536
|
-
# 2. Most Prolog source files have a rule somewhere (marked by the :- operator)
|
537
|
-
# 3. Default to Perl, because it is more popular
|
538
|
-
#
|
539
|
-
# Returns a Language.
|
540
|
-
def guess_pl_language
|
541
|
-
if shebang_script == 'perl'
|
542
|
-
Language['Perl']
|
543
|
-
elsif lines.grep(/:-/).any?
|
544
|
-
Language['Prolog']
|
545
|
-
else
|
546
|
-
Language['Perl']
|
547
|
-
end
|
548
|
-
end
|
549
|
-
|
550
|
-
# Internal: Guess language of .r files.
|
551
|
-
#
|
552
|
-
# Returns a Language.
|
553
|
-
def guess_r_language
|
554
|
-
if lines.grep(/(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i).any?
|
555
|
-
Language['Rebol']
|
556
|
-
else
|
557
|
-
Language['R']
|
558
|
-
end
|
559
|
-
end
|
560
|
-
|
561
|
-
# Internal: Guess language of .t files.
|
562
|
-
#
|
563
|
-
# Returns a Language.
|
564
|
-
def guess_t_language
|
565
|
-
score = 0
|
566
|
-
score += 1 if lines.grep(/^% /).any?
|
567
|
-
score += data.gsub(/ := /).count
|
568
|
-
score += data.gsub(/proc |procedure |fcn |function /).count
|
569
|
-
score += data.gsub(/var \w+: \w+/).count
|
570
|
-
|
571
|
-
# Tell-tale signs its gotta be Perl
|
572
|
-
if lines.grep(/^(my )?(sub |\$|@|%)\w+/).any?
|
573
|
-
score = 0
|
574
|
-
end
|
575
|
-
|
576
|
-
if score >= 3
|
577
|
-
Language['Turing']
|
578
|
-
else
|
579
|
-
Language['Perl']
|
580
|
-
end
|
581
|
-
end
|
582
|
-
|
583
|
-
# Internal: Guess language of .v files.
|
584
|
-
#
|
585
|
-
# Returns a Language
|
586
|
-
def guess_v_language
|
587
|
-
if lines.grep(/^(\/\*|\/\/|module|parameter|input|output|wire|reg|always|initial|begin|\`)/).any?
|
588
|
-
Language['Verilog']
|
589
|
-
else
|
590
|
-
Language['Coq']
|
591
|
-
end
|
592
|
-
end
|
593
|
-
|
594
|
-
# Internal: Guess language of .gsp files.
|
595
|
-
#
|
596
|
-
# Returns a Language.
|
597
|
-
def guess_gsp_language
|
598
|
-
if lines.grep(/<%|<%@|\$\{|<%|<g:|<meta name="layout"|<r:/).any?
|
599
|
-
Language['Groovy Server Pages']
|
600
|
-
else
|
601
|
-
Language['Gosu']
|
602
|
-
end
|
603
|
-
end
|
604
|
-
|
605
|
-
# Internal: Guess language from the first line.
|
606
|
-
#
|
607
|
-
# Look for leading "<?php" in Drupal files
|
608
|
-
#
|
609
|
-
# Returns a Language.
|
610
|
-
def first_line_language
|
611
|
-
# Only check files with drupal php extensions
|
612
|
-
return unless drupal_extname?
|
613
|
-
|
614
|
-
# Fail fast if blob isn't viewable?
|
615
|
-
return unless viewable?
|
616
|
-
|
617
|
-
if lines.first.to_s =~ /^<\?php/
|
618
|
-
Language['PHP']
|
619
|
-
end
|
620
|
-
end
|
621
|
-
|
622
444
|
# Internal: Extract the script name from the shebang line
|
623
445
|
#
|
624
446
|
# Requires Blob#data
|
@@ -710,12 +532,5 @@ module Linguist
|
|
710
532
|
''
|
711
533
|
end
|
712
534
|
end
|
713
|
-
|
714
|
-
Language.overridden_extensions.each do |extension|
|
715
|
-
name = "guess_#{extension.sub(/^\./, '')}_language".to_sym
|
716
|
-
unless instance_methods.map(&:to_sym).include?(name)
|
717
|
-
raise NotImplementedError, "Language##{name} was not defined"
|
718
|
-
end
|
719
|
-
end
|
720
535
|
end
|
721
536
|
end
|
data/lib/linguist/classifier.rb
CHANGED
@@ -1,113 +1,88 @@
|
|
1
|
-
require 'linguist/language'
|
2
1
|
require 'linguist/tokenizer'
|
3
2
|
|
4
3
|
module Linguist
|
5
4
|
# Language bayesian classifier.
|
6
5
|
class Classifier
|
7
|
-
# Internal: Path to persisted classifier db.
|
8
|
-
PATH = File.expand_path('../classifier.yml', __FILE__)
|
9
|
-
|
10
|
-
# Public: Check if persisted db exists on disk.
|
11
|
-
#
|
12
|
-
# Returns Boolean.
|
13
|
-
def self.exist?
|
14
|
-
File.exist?(PATH)
|
15
|
-
end
|
16
|
-
|
17
|
-
# Public: Get persisted Classifier instance.
|
18
|
-
#
|
19
|
-
# Returns Classifier.
|
20
|
-
def self.instance
|
21
|
-
@instance ||= YAML.load_file(PATH)
|
22
|
-
end
|
23
|
-
|
24
|
-
# Public: Initialize a Classifier.
|
25
|
-
def initialize
|
26
|
-
@tokens_total = 0
|
27
|
-
@languages_total = 0
|
28
|
-
@tokens = Hash.new { |h, k| h[k] = Hash.new(0) }
|
29
|
-
@language_tokens = Hash.new(0)
|
30
|
-
@languages = Hash.new(0)
|
31
|
-
end
|
32
|
-
|
33
|
-
# Public: Compare Classifier objects.
|
34
|
-
#
|
35
|
-
# other - Classifier object to compare to.
|
36
|
-
#
|
37
|
-
# Returns Boolean.
|
38
|
-
def eql?(other)
|
39
|
-
# Lazy fast check counts only
|
40
|
-
other.is_a?(self.class) &&
|
41
|
-
@tokens_total == other.instance_variable_get(:@tokens_total) &&
|
42
|
-
@languages_total == other.instance_variable_get(:@languages_total)
|
43
|
-
end
|
44
|
-
alias_method :==, :eql?
|
45
|
-
|
46
6
|
# Public: Train classifier that data is a certain language.
|
47
7
|
#
|
48
|
-
#
|
8
|
+
# db - Hash classifier database object
|
9
|
+
# language - String language of data
|
49
10
|
# data - String contents of file
|
50
11
|
#
|
51
12
|
# Examples
|
52
13
|
#
|
53
|
-
# train(
|
14
|
+
# Classifier.train!(db, 'Ruby', "def hello; end")
|
54
15
|
#
|
55
16
|
# Returns nothing.
|
56
|
-
def train(language, data)
|
57
|
-
|
58
|
-
|
17
|
+
def self.train!(db, language, data)
|
18
|
+
tokens = Tokenizer.tokenize(data)
|
19
|
+
|
20
|
+
db['tokens_total'] ||= 0
|
21
|
+
db['languages_total'] ||= 0
|
22
|
+
db['tokens'] ||= {}
|
23
|
+
db['language_tokens'] ||= {}
|
24
|
+
db['languages'] ||= {}
|
59
25
|
|
60
26
|
tokens.each do |token|
|
61
|
-
|
62
|
-
|
63
|
-
|
27
|
+
db['tokens'][language] ||= {}
|
28
|
+
db['tokens'][language][token] ||= 0
|
29
|
+
db['tokens'][language][token] += 1
|
30
|
+
db['language_tokens'][language] ||= 0
|
31
|
+
db['language_tokens'][language] += 1
|
32
|
+
db['tokens_total'] += 1
|
64
33
|
end
|
65
|
-
|
66
|
-
|
34
|
+
db['languages'][language] ||= 0
|
35
|
+
db['languages'][language] += 1
|
36
|
+
db['languages_total'] += 1
|
67
37
|
|
68
38
|
nil
|
69
39
|
end
|
70
40
|
|
71
|
-
# Public:
|
41
|
+
# Public: Guess language of data.
|
42
|
+
#
|
43
|
+
# db - Hash of classifier tokens database.
|
44
|
+
# data - Array of tokens or String data to analyze.
|
45
|
+
# languages - Array of language name Strings to restrict to.
|
46
|
+
#
|
47
|
+
# Examples
|
72
48
|
#
|
73
|
-
#
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
49
|
+
# Classifier.classify(db, "def hello; end")
|
50
|
+
# # => [ ['Ruby', 0.90], ['Python', 0.2], ... ]
|
51
|
+
#
|
52
|
+
# Returns sorted Array of result pairs. Each pair contains the
|
53
|
+
# String language name and a Float score.
|
54
|
+
def self.classify(db, tokens, languages = nil)
|
55
|
+
languages ||= db['languages'].keys
|
56
|
+
new(db).classify(tokens, languages)
|
78
57
|
end
|
79
58
|
|
80
|
-
#
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
59
|
+
# Internal: Initialize a Classifier.
|
60
|
+
def initialize(db = {})
|
61
|
+
@tokens_total = db['tokens_total']
|
62
|
+
@languages_total = db['languages_total']
|
63
|
+
@tokens = db['tokens']
|
64
|
+
@language_tokens = db['language_tokens']
|
65
|
+
@languages = db['languages']
|
85
66
|
end
|
86
67
|
|
87
|
-
#
|
68
|
+
# Internal: Guess language of data
|
88
69
|
#
|
89
70
|
# data - Array of tokens or String data to analyze.
|
90
|
-
# languages - Array of
|
91
|
-
#
|
92
|
-
# Examples
|
93
|
-
#
|
94
|
-
# classify("def hello; end")
|
95
|
-
# # => [ [Language['Ruby'], 0.90], [Language['Python'], 0.2], ... ]
|
71
|
+
# languages - Array of language name Strings to restrict to.
|
96
72
|
#
|
97
73
|
# Returns sorted Array of result pairs. Each pair contains the
|
98
|
-
#
|
99
|
-
def classify(tokens, languages
|
74
|
+
# String language name and a Float score.
|
75
|
+
def classify(tokens, languages)
|
100
76
|
return [] if tokens.nil?
|
101
|
-
tokens = Tokenizer.
|
77
|
+
tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
|
102
78
|
|
103
79
|
scores = {}
|
104
80
|
languages.each do |language|
|
105
|
-
|
106
|
-
|
107
|
-
language_probability(language_name)
|
81
|
+
scores[language] = tokens_probability(tokens, language) +
|
82
|
+
language_probability(language)
|
108
83
|
end
|
109
84
|
|
110
|
-
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [
|
85
|
+
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
|
111
86
|
end
|
112
87
|
|
113
88
|
# Internal: Probability of a set of tokens occurring in a language - P(D | C)
|
@@ -144,41 +119,5 @@ module Linguist
|
|
144
119
|
def language_probability(language)
|
145
120
|
Math.log(@languages[language].to_f / @languages_total.to_f)
|
146
121
|
end
|
147
|
-
|
148
|
-
# Public: Serialize classifier to YAML.
|
149
|
-
#
|
150
|
-
# opts - Hash of YAML options.
|
151
|
-
#
|
152
|
-
# Returns nothing.
|
153
|
-
def to_yaml(io)
|
154
|
-
data = "--- !ruby/object:Linguist::Classifier\n"
|
155
|
-
|
156
|
-
data << "languages_total: #{@languages_total}\n"
|
157
|
-
data << "tokens_total: #{@tokens_total}\n"
|
158
|
-
|
159
|
-
data << "languages:\n"
|
160
|
-
@languages.sort.each do |language, count|
|
161
|
-
data << " #{{language => count}.to_yaml.lines.to_a[1]}"
|
162
|
-
end
|
163
|
-
|
164
|
-
data << "language_tokens:\n"
|
165
|
-
@language_tokens.sort.each do |language, count|
|
166
|
-
data << " #{{language => count}.to_yaml.lines.to_a[1]}"
|
167
|
-
end
|
168
|
-
|
169
|
-
data << "tokens:\n"
|
170
|
-
@tokens.sort.each do |language, tokens|
|
171
|
-
data << " #{{language => true}.to_yaml.lines.to_a[1].sub(/ true/, "")}"
|
172
|
-
tokens.sort.each do |token, count|
|
173
|
-
data << " #{{token => count}.to_yaml.lines.to_a[1]}"
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
io.write data
|
178
|
-
nil
|
179
|
-
end
|
180
122
|
end
|
181
|
-
|
182
|
-
# Eager load instance
|
183
|
-
Classifier.instance if Classifier.exist?
|
184
123
|
end
|