github-linguist 4.0.3 → 4.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/linguist.rb +1 -0
- data/lib/linguist/classifier.rb +19 -0
- data/lib/linguist/file_blob.rb +14 -8
- data/lib/linguist/heuristics.rb +112 -110
- data/lib/linguist/language.rb +39 -46
- data/lib/linguist/languages.json +1 -1
- data/lib/linguist/languages.yml +77 -6
- data/lib/linguist/samples.json +3292 -454
- data/lib/linguist/samples.rb +6 -39
- data/lib/linguist/shebang.rb +44 -0
- data/lib/linguist/strategy/filename.rb +20 -0
- data/lib/linguist/vendor.yml +0 -3
- data/lib/linguist/version.rb +1 -1
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4fd8402379e8ac3de17921cf9831e6db303bea33
|
4
|
+
data.tar.gz: 546482b4f73f6c6a512b0258e0768aeb51fdbfb1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 73d9c3e80b0884a3ef96f5281028bf943f183de4f11aecd5ef31b986fb8f13d4fa70a38a475a0ca6ba0e3aa20cd3b8965edd41e5f26f15e35ae375c30974382b
|
7
|
+
data.tar.gz: 18d6610244f861a7c4f925b1332370b39901fb7f640efba8b5a83ca7f80321aa17cf5757dd532b7c54206b1c2b66e99c00aafe2213aefcd695205af7afc01bff
|
data/lib/linguist.rb
CHANGED
data/lib/linguist/classifier.rb
CHANGED
@@ -3,6 +3,25 @@ require 'linguist/tokenizer'
|
|
3
3
|
module Linguist
|
4
4
|
# Language bayesian classifier.
|
5
5
|
class Classifier
|
6
|
+
# Public: Use the classifier to detect language of the blob.
|
7
|
+
#
|
8
|
+
# blob - An object that quacks like a blob.
|
9
|
+
# possible_languages - Array of Language objects
|
10
|
+
#
|
11
|
+
# Examples
|
12
|
+
#
|
13
|
+
# Classifier.call(FileBlob.new("path/to/file"), [
|
14
|
+
# Language["Ruby"], Language["Python"]
|
15
|
+
# ])
|
16
|
+
#
|
17
|
+
# Returns an Array of Language objects, most probable first.
|
18
|
+
def self.call(blob, possible_languages)
|
19
|
+
language_names = possible_languages.map(&:name)
|
20
|
+
classify(Samples.cache, blob.data, language_names).map do |name, _|
|
21
|
+
Language[name] # Return the actual Language objects
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
6
25
|
# Public: Train classifier that data is a certain language.
|
7
26
|
#
|
8
27
|
# db - Hash classifier database object
|
data/lib/linguist/file_blob.rb
CHANGED
@@ -57,14 +57,20 @@ module Linguist
|
|
57
57
|
#
|
58
58
|
# Returns a String.
|
59
59
|
def extension
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
60
|
+
extensions.last || ""
|
61
|
+
end
|
62
|
+
|
63
|
+
# Public: Return an array of the file extensions
|
64
|
+
#
|
65
|
+
# >> Linguist::FileBlob.new("app/views/things/index.html.erb").extensions
|
66
|
+
# => [".html.erb", ".erb"]
|
67
|
+
#
|
68
|
+
# Returns an Array
|
69
|
+
def extensions
|
70
|
+
basename, *segments = File.basename(name).split(".")
|
71
|
+
|
72
|
+
segments.map.with_index do |segment, index|
|
73
|
+
"." + segments[index..-1].join(".")
|
68
74
|
end
|
69
75
|
end
|
70
76
|
end
|
data/lib/linguist/heuristics.rb
CHANGED
@@ -1,158 +1,160 @@
|
|
1
1
|
module Linguist
|
2
2
|
# A collection of simple heuristics that can be used to better analyze languages.
|
3
3
|
class Heuristics
|
4
|
-
|
5
|
-
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
4
|
+
# Public: Use heuristics to detect language of the blob.
|
5
|
+
#
|
6
|
+
# blob - An object that quacks like a blob.
|
7
|
+
# possible_languages - Array of Language objects
|
8
|
+
#
|
9
|
+
# Examples
|
9
10
|
#
|
10
|
-
#
|
11
|
-
#
|
11
|
+
# Heuristics.call(FileBlob.new("path/to/file"), [
|
12
|
+
# Language["Ruby"], Language["Python"]
|
13
|
+
# ])
|
12
14
|
#
|
13
|
-
# Returns an
|
14
|
-
def self.
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
result = disambiguate_pl(data)
|
20
|
-
end
|
21
|
-
if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
|
22
|
-
result = disambiguate_ecl(data)
|
23
|
-
end
|
24
|
-
if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
|
25
|
-
result = disambiguate_pro(data)
|
26
|
-
end
|
27
|
-
if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
|
28
|
-
result = disambiguate_cl(data)
|
29
|
-
end
|
30
|
-
if languages.all? { |l| ["Hack", "PHP"].include?(l) }
|
31
|
-
result = disambiguate_hack(data)
|
32
|
-
end
|
33
|
-
if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
|
34
|
-
result = disambiguate_sc(data)
|
35
|
-
end
|
36
|
-
if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
|
37
|
-
result = disambiguate_asc(data)
|
38
|
-
end
|
39
|
-
if languages.all? { |l| ["FORTRAN", "Forth"].include?(l) }
|
40
|
-
result = disambiguate_f(data)
|
41
|
-
end
|
42
|
-
return result
|
15
|
+
# Returns an Array of languages, or empty if none matched or were inconclusive.
|
16
|
+
def self.call(blob, languages)
|
17
|
+
data = blob.data
|
18
|
+
|
19
|
+
@heuristics.each do |heuristic|
|
20
|
+
return Array(heuristic.call(data)) if heuristic.matches?(languages)
|
43
21
|
end
|
22
|
+
|
23
|
+
[] # No heuristics matched
|
44
24
|
end
|
45
25
|
|
46
|
-
#
|
47
|
-
# We want to shortcut look for Objective-C _and_ now C++ too!
|
26
|
+
# Internal: Define a new heuristic.
|
48
27
|
#
|
49
|
-
#
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
28
|
+
# languages - String names of languages to disambiguate.
|
29
|
+
# heuristic - Block which takes data as an argument and returns a Language or nil.
|
30
|
+
#
|
31
|
+
# Examples
|
32
|
+
#
|
33
|
+
# disambiguate "Perl", "Prolog" do |data|
|
34
|
+
# if data.include?("use strict")
|
35
|
+
# Language["Perl"]
|
36
|
+
# elsif data.include?(":-")
|
37
|
+
# Language["Prolog"]
|
38
|
+
# end
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
def self.disambiguate(*languages, &heuristic)
|
42
|
+
@heuristics << new(languages, &heuristic)
|
43
|
+
end
|
44
|
+
|
45
|
+
# Internal: Array of defined heuristics
|
46
|
+
@heuristics = []
|
47
|
+
|
48
|
+
# Internal
|
49
|
+
def initialize(languages, &heuristic)
|
50
|
+
@languages = languages
|
51
|
+
@heuristic = heuristic
|
52
|
+
end
|
53
|
+
|
54
|
+
# Internal: Check if this heuristic matches the candidate languages.
|
55
|
+
def matches?(candidates)
|
56
|
+
candidates.all? { |l| @languages.include?(l.name) }
|
57
|
+
end
|
58
|
+
|
59
|
+
# Internal: Perform the heuristic
|
60
|
+
def call(data)
|
61
|
+
@heuristic.call(data)
|
62
|
+
end
|
63
|
+
|
64
|
+
disambiguate "Objective-C", "C++", "C" do |data|
|
65
|
+
if (/@(interface|class|protocol|property|end|synchronised|selector|implementation)\b/.match(data))
|
66
|
+
Language["Objective-C"]
|
67
|
+
elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
|
68
|
+
/^\s*template\s*</.match(data) || /^[^@]class\s+\w+/.match(data) || /^[^@](private|public|protected):$/.match(data) || /std::.+$/.match(data))
|
69
|
+
Language["C++"]
|
56
70
|
end
|
57
|
-
matches
|
58
71
|
end
|
59
72
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
73
|
+
disambiguate "Perl", "Perl6", "Prolog" do |data|
|
74
|
+
if data.include?("use v6")
|
75
|
+
Language["Perl6"]
|
76
|
+
elsif data.include?("use strict")
|
77
|
+
Language["Perl"]
|
64
78
|
elsif data.include?(":-")
|
65
|
-
|
79
|
+
Language["Prolog"]
|
66
80
|
end
|
67
|
-
matches
|
68
81
|
end
|
69
82
|
|
70
|
-
|
71
|
-
matches = []
|
83
|
+
disambiguate "ECL", "Prolog" do |data|
|
72
84
|
if data.include?(":-")
|
73
|
-
|
85
|
+
Language["Prolog"]
|
74
86
|
elsif data.include?(":=")
|
75
|
-
|
87
|
+
Language["ECL"]
|
76
88
|
end
|
77
|
-
matches
|
78
89
|
end
|
79
90
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
matches << Language["Prolog"]
|
84
|
-
else
|
85
|
-
matches << Language["IDL"]
|
86
|
-
end
|
87
|
-
matches
|
88
|
-
end
|
89
|
-
|
90
|
-
def self.disambiguate_ts(data)
|
91
|
-
matches = []
|
92
|
-
if (data.include?("</translation>"))
|
93
|
-
matches << Language["XML"]
|
91
|
+
disambiguate "IDL", "Prolog" do |data|
|
92
|
+
if data.include?(":-")
|
93
|
+
Language["Prolog"]
|
94
94
|
else
|
95
|
-
|
95
|
+
Language["IDL"]
|
96
96
|
end
|
97
|
-
matches
|
98
97
|
end
|
99
98
|
|
100
|
-
|
101
|
-
matches = []
|
99
|
+
disambiguate "Common Lisp", "OpenCL", "Cool" do |data|
|
102
100
|
if data.include?("(defun ")
|
103
|
-
|
101
|
+
Language["Common Lisp"]
|
102
|
+
elsif /^class/x.match(data)
|
103
|
+
Language["Cool"]
|
104
104
|
elsif /\/\* |\/\/ |^\}/.match(data)
|
105
|
-
|
105
|
+
Language["OpenCL"]
|
106
106
|
end
|
107
|
-
matches
|
108
|
-
end
|
109
|
-
|
110
|
-
def self.disambiguate_r(data)
|
111
|
-
matches = []
|
112
|
-
matches << Language["Rebol"] if /\bRebol\b/i.match(data)
|
113
|
-
matches << Language["R"] if data.include?("<-")
|
114
|
-
matches
|
115
107
|
end
|
116
108
|
|
117
|
-
|
118
|
-
matches = []
|
109
|
+
disambiguate "Hack", "PHP" do |data|
|
119
110
|
if data.include?("<?hh")
|
120
|
-
|
111
|
+
Language["Hack"]
|
121
112
|
elsif /<?[^h]/.match(data)
|
122
|
-
|
113
|
+
Language["PHP"]
|
123
114
|
end
|
124
|
-
matches
|
125
115
|
end
|
126
116
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
if (/^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data))
|
133
|
-
matches << Language["Scala"]
|
117
|
+
disambiguate "Scala", "SuperCollider" do |data|
|
118
|
+
if /\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data)
|
119
|
+
Language["SuperCollider"]
|
120
|
+
elsif /^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data)
|
121
|
+
Language["Scala"]
|
134
122
|
end
|
135
|
-
matches
|
136
123
|
end
|
137
124
|
|
138
|
-
|
139
|
-
|
140
|
-
matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
|
141
|
-
matches
|
125
|
+
disambiguate "AsciiDoc", "AGS Script" do |data|
|
126
|
+
Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
|
142
127
|
end
|
143
128
|
|
144
|
-
|
145
|
-
matches = []
|
129
|
+
disambiguate "FORTRAN", "Forth" do |data|
|
146
130
|
if /^: /.match(data)
|
147
|
-
|
131
|
+
Language["Forth"]
|
148
132
|
elsif /^([c*][^a-z]| subroutine\s)/i.match(data)
|
149
|
-
|
133
|
+
Language["FORTRAN"]
|
150
134
|
end
|
151
|
-
matches
|
152
135
|
end
|
153
136
|
|
154
|
-
|
155
|
-
|
137
|
+
disambiguate "F#", "Forth", "GLSL" do |data|
|
138
|
+
if /^(: |new-device)/.match(data)
|
139
|
+
Language["Forth"]
|
140
|
+
elsif /^(#light|import|let|module|namespace|open|type)/.match(data)
|
141
|
+
Language["F#"]
|
142
|
+
elsif /^(#include|#pragma|precision|uniform|varying|void)/.match(data)
|
143
|
+
Language["GLSL"]
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
disambiguate "Gosu", "JavaScript" do |data|
|
148
|
+
Language["Gosu"] if /^uses java\./.match(data)
|
149
|
+
end
|
150
|
+
|
151
|
+
disambiguate "LoomScript", "LiveScript" do |data|
|
152
|
+
if /^\s*package\s*[\w\.\/\*\s]*\s*{/.match(data)
|
153
|
+
Language["LoomScript"]
|
154
|
+
else
|
155
|
+
Language["LiveScript"]
|
156
|
+
end
|
156
157
|
end
|
158
|
+
|
157
159
|
end
|
158
160
|
end
|
data/lib/linguist/language.rb
CHANGED
@@ -10,6 +10,8 @@ require 'linguist/heuristics'
|
|
10
10
|
require 'linguist/samples'
|
11
11
|
require 'linguist/file_blob'
|
12
12
|
require 'linguist/blob_helper'
|
13
|
+
require 'linguist/strategy/filename'
|
14
|
+
require 'linguist/shebang'
|
13
15
|
|
14
16
|
module Linguist
|
15
17
|
# Language names that are recognizable by GitHub. Defined languages
|
@@ -91,6 +93,13 @@ module Linguist
|
|
91
93
|
language
|
92
94
|
end
|
93
95
|
|
96
|
+
STRATEGIES = [
|
97
|
+
Linguist::Strategy::Filename,
|
98
|
+
Linguist::Shebang,
|
99
|
+
Linguist::Heuristics,
|
100
|
+
Linguist::Classifier
|
101
|
+
]
|
102
|
+
|
94
103
|
# Public: Detects the Language of the blob.
|
95
104
|
#
|
96
105
|
# blob - an object that includes the Linguist `BlobHelper` interface;
|
@@ -98,49 +107,22 @@ module Linguist
|
|
98
107
|
#
|
99
108
|
# Returns Language or nil.
|
100
109
|
def self.detect(blob)
|
101
|
-
name = blob.name.to_s
|
102
|
-
|
103
110
|
# Bail early if the blob is binary or empty.
|
104
111
|
return nil if blob.likely_binary? || blob.binary? || blob.empty?
|
105
112
|
|
106
|
-
#
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
# If there is more than one possible language with that extension (or no
|
118
|
-
# extension at all, in the case of extensionless scripts), we need to continue
|
119
|
-
# our detection work
|
120
|
-
if possible_languages.length > 1
|
121
|
-
data = blob.data
|
122
|
-
possible_language_names = possible_languages.map(&:name)
|
123
|
-
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
|
124
|
-
|
125
|
-
if heuristic_languages.size > 1
|
126
|
-
possible_language_names = heuristic_languages.map(&:name)
|
127
|
-
end
|
128
|
-
|
129
|
-
# Check if there's a shebang line and use that as authoritative
|
130
|
-
if (result = find_by_shebang(data)) && !result.empty?
|
131
|
-
result.first
|
132
|
-
# No shebang. Still more work to do. Try to find it with our heuristics.
|
133
|
-
elsif heuristic_languages.size == 1
|
134
|
-
heuristic_languages.first
|
135
|
-
# Lastly, fall back to the probabilistic classifier.
|
136
|
-
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
137
|
-
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
|
138
|
-
Language[classified[0]]
|
113
|
+
# Call each strategy until one candidate is returned.
|
114
|
+
STRATEGIES.reduce([]) do |languages, strategy|
|
115
|
+
candidates = strategy.call(blob, languages)
|
116
|
+
if candidates.size == 1
|
117
|
+
return candidates.first
|
118
|
+
elsif candidates.size > 1
|
119
|
+
# More than one candidate was found, pass them to the next strategy.
|
120
|
+
candidates
|
121
|
+
else
|
122
|
+
# No candidates were found, pass on languages from the previous strategy.
|
123
|
+
languages
|
139
124
|
end
|
140
|
-
|
141
|
-
# Simplest and most common case, we can just return the one match based on extension
|
142
|
-
possible_languages.first
|
143
|
-
end
|
125
|
+
end.first
|
144
126
|
end
|
145
127
|
|
146
128
|
# Public: Get all Languages
|
@@ -190,8 +172,13 @@ module Linguist
|
|
190
172
|
# Returns all matching Languages or [] if none were found.
|
191
173
|
def self.find_by_filename(filename)
|
192
174
|
basename = File.basename(filename)
|
193
|
-
|
194
|
-
|
175
|
+
|
176
|
+
# find the first extension with language definitions
|
177
|
+
extname = FileBlob.new(filename).extensions.detect do |e|
|
178
|
+
!@extension_index[e].empty?
|
179
|
+
end
|
180
|
+
|
181
|
+
(@filename_index[basename] + @extension_index[extname]).compact.uniq
|
195
182
|
end
|
196
183
|
|
197
184
|
# Public: Look up Languages by file extension.
|
@@ -212,20 +199,26 @@ module Linguist
|
|
212
199
|
@extension_index[extname]
|
213
200
|
end
|
214
201
|
|
215
|
-
#
|
202
|
+
# DEPRECATED
|
203
|
+
def self.find_by_shebang(data)
|
204
|
+
@interpreter_index[Shebang.interpreter(data)]
|
205
|
+
end
|
206
|
+
|
207
|
+
# Public: Look up Languages by interpreter.
|
216
208
|
#
|
217
|
-
#
|
209
|
+
# interpreter - String of interpreter name
|
218
210
|
#
|
219
211
|
# Examples
|
220
212
|
#
|
221
|
-
# Language.
|
213
|
+
# Language.find_by_interpreter("bash")
|
222
214
|
# # => [#<Language name="Bash">]
|
223
215
|
#
|
224
216
|
# Returns the matching Language
|
225
|
-
def self.
|
226
|
-
@interpreter_index[
|
217
|
+
def self.find_by_interpreter(interpreter)
|
218
|
+
@interpreter_index[interpreter]
|
227
219
|
end
|
228
220
|
|
221
|
+
|
229
222
|
# Public: Look up Language by its name or lexer.
|
230
223
|
#
|
231
224
|
# name - The String name of the Language
|