github-linguist 4.0.3 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/linguist.rb +1 -0
- data/lib/linguist/classifier.rb +19 -0
- data/lib/linguist/file_blob.rb +14 -8
- data/lib/linguist/heuristics.rb +112 -110
- data/lib/linguist/language.rb +39 -46
- data/lib/linguist/languages.json +1 -1
- data/lib/linguist/languages.yml +77 -6
- data/lib/linguist/samples.json +3292 -454
- data/lib/linguist/samples.rb +6 -39
- data/lib/linguist/shebang.rb +44 -0
- data/lib/linguist/strategy/filename.rb +20 -0
- data/lib/linguist/vendor.yml +0 -3
- data/lib/linguist/version.rb +1 -1
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4fd8402379e8ac3de17921cf9831e6db303bea33
|
4
|
+
data.tar.gz: 546482b4f73f6c6a512b0258e0768aeb51fdbfb1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 73d9c3e80b0884a3ef96f5281028bf943f183de4f11aecd5ef31b986fb8f13d4fa70a38a475a0ca6ba0e3aa20cd3b8965edd41e5f26f15e35ae375c30974382b
|
7
|
+
data.tar.gz: 18d6610244f861a7c4f925b1332370b39901fb7f640efba8b5a83ca7f80321aa17cf5757dd532b7c54206b1c2b66e99c00aafe2213aefcd695205af7afc01bff
|
data/lib/linguist.rb
CHANGED
data/lib/linguist/classifier.rb
CHANGED
@@ -3,6 +3,25 @@ require 'linguist/tokenizer'
|
|
3
3
|
module Linguist
|
4
4
|
# Language bayesian classifier.
|
5
5
|
class Classifier
|
6
|
+
# Public: Use the classifier to detect language of the blob.
|
7
|
+
#
|
8
|
+
# blob - An object that quacks like a blob.
|
9
|
+
# possible_languages - Array of Language objects
|
10
|
+
#
|
11
|
+
# Examples
|
12
|
+
#
|
13
|
+
# Classifier.call(FileBlob.new("path/to/file"), [
|
14
|
+
# Language["Ruby"], Language["Python"]
|
15
|
+
# ])
|
16
|
+
#
|
17
|
+
# Returns an Array of Language objects, most probable first.
|
18
|
+
def self.call(blob, possible_languages)
|
19
|
+
language_names = possible_languages.map(&:name)
|
20
|
+
classify(Samples.cache, blob.data, language_names).map do |name, _|
|
21
|
+
Language[name] # Return the actual Language objects
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
6
25
|
# Public: Train classifier that data is a certain language.
|
7
26
|
#
|
8
27
|
# db - Hash classifier database object
|
data/lib/linguist/file_blob.rb
CHANGED
@@ -57,14 +57,20 @@ module Linguist
|
|
57
57
|
#
|
58
58
|
# Returns a String.
|
59
59
|
def extension
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
60
|
+
extensions.last || ""
|
61
|
+
end
|
62
|
+
|
63
|
+
# Public: Return an array of the file extensions
|
64
|
+
#
|
65
|
+
# >> Linguist::FileBlob.new("app/views/things/index.html.erb").extensions
|
66
|
+
# => [".html.erb", ".erb"]
|
67
|
+
#
|
68
|
+
# Returns an Array
|
69
|
+
def extensions
|
70
|
+
basename, *segments = File.basename(name).split(".")
|
71
|
+
|
72
|
+
segments.map.with_index do |segment, index|
|
73
|
+
"." + segments[index..-1].join(".")
|
68
74
|
end
|
69
75
|
end
|
70
76
|
end
|
data/lib/linguist/heuristics.rb
CHANGED
@@ -1,158 +1,160 @@
|
|
1
1
|
module Linguist
|
2
2
|
# A collection of simple heuristics that can be used to better analyze languages.
|
3
3
|
class Heuristics
|
4
|
-
|
5
|
-
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
4
|
+
# Public: Use heuristics to detect language of the blob.
|
5
|
+
#
|
6
|
+
# blob - An object that quacks like a blob.
|
7
|
+
# possible_languages - Array of Language objects
|
8
|
+
#
|
9
|
+
# Examples
|
9
10
|
#
|
10
|
-
#
|
11
|
-
#
|
11
|
+
# Heuristics.call(FileBlob.new("path/to/file"), [
|
12
|
+
# Language["Ruby"], Language["Python"]
|
13
|
+
# ])
|
12
14
|
#
|
13
|
-
# Returns an
|
14
|
-
def self.
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
result = disambiguate_pl(data)
|
20
|
-
end
|
21
|
-
if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
|
22
|
-
result = disambiguate_ecl(data)
|
23
|
-
end
|
24
|
-
if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
|
25
|
-
result = disambiguate_pro(data)
|
26
|
-
end
|
27
|
-
if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
|
28
|
-
result = disambiguate_cl(data)
|
29
|
-
end
|
30
|
-
if languages.all? { |l| ["Hack", "PHP"].include?(l) }
|
31
|
-
result = disambiguate_hack(data)
|
32
|
-
end
|
33
|
-
if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
|
34
|
-
result = disambiguate_sc(data)
|
35
|
-
end
|
36
|
-
if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
|
37
|
-
result = disambiguate_asc(data)
|
38
|
-
end
|
39
|
-
if languages.all? { |l| ["FORTRAN", "Forth"].include?(l) }
|
40
|
-
result = disambiguate_f(data)
|
41
|
-
end
|
42
|
-
return result
|
15
|
+
# Returns an Array of languages, or empty if none matched or were inconclusive.
|
16
|
+
def self.call(blob, languages)
|
17
|
+
data = blob.data
|
18
|
+
|
19
|
+
@heuristics.each do |heuristic|
|
20
|
+
return Array(heuristic.call(data)) if heuristic.matches?(languages)
|
43
21
|
end
|
22
|
+
|
23
|
+
[] # No heuristics matched
|
44
24
|
end
|
45
25
|
|
46
|
-
#
|
47
|
-
# We want to shortcut look for Objective-C _and_ now C++ too!
|
26
|
+
# Internal: Define a new heuristic.
|
48
27
|
#
|
49
|
-
#
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
28
|
+
# languages - String names of languages to disambiguate.
|
29
|
+
# heuristic - Block which takes data as an argument and returns a Language or nil.
|
30
|
+
#
|
31
|
+
# Examples
|
32
|
+
#
|
33
|
+
# disambiguate "Perl", "Prolog" do |data|
|
34
|
+
# if data.include?("use strict")
|
35
|
+
# Language["Perl"]
|
36
|
+
# elsif data.include?(":-")
|
37
|
+
# Language["Prolog"]
|
38
|
+
# end
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
def self.disambiguate(*languages, &heuristic)
|
42
|
+
@heuristics << new(languages, &heuristic)
|
43
|
+
end
|
44
|
+
|
45
|
+
# Internal: Array of defined heuristics
|
46
|
+
@heuristics = []
|
47
|
+
|
48
|
+
# Internal
|
49
|
+
def initialize(languages, &heuristic)
|
50
|
+
@languages = languages
|
51
|
+
@heuristic = heuristic
|
52
|
+
end
|
53
|
+
|
54
|
+
# Internal: Check if this heuristic matches the candidate languages.
|
55
|
+
def matches?(candidates)
|
56
|
+
candidates.all? { |l| @languages.include?(l.name) }
|
57
|
+
end
|
58
|
+
|
59
|
+
# Internal: Perform the heuristic
|
60
|
+
def call(data)
|
61
|
+
@heuristic.call(data)
|
62
|
+
end
|
63
|
+
|
64
|
+
disambiguate "Objective-C", "C++", "C" do |data|
|
65
|
+
if (/@(interface|class|protocol|property|end|synchronised|selector|implementation)\b/.match(data))
|
66
|
+
Language["Objective-C"]
|
67
|
+
elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
|
68
|
+
/^\s*template\s*</.match(data) || /^[^@]class\s+\w+/.match(data) || /^[^@](private|public|protected):$/.match(data) || /std::.+$/.match(data))
|
69
|
+
Language["C++"]
|
56
70
|
end
|
57
|
-
matches
|
58
71
|
end
|
59
72
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
73
|
+
disambiguate "Perl", "Perl6", "Prolog" do |data|
|
74
|
+
if data.include?("use v6")
|
75
|
+
Language["Perl6"]
|
76
|
+
elsif data.include?("use strict")
|
77
|
+
Language["Perl"]
|
64
78
|
elsif data.include?(":-")
|
65
|
-
|
79
|
+
Language["Prolog"]
|
66
80
|
end
|
67
|
-
matches
|
68
81
|
end
|
69
82
|
|
70
|
-
|
71
|
-
matches = []
|
83
|
+
disambiguate "ECL", "Prolog" do |data|
|
72
84
|
if data.include?(":-")
|
73
|
-
|
85
|
+
Language["Prolog"]
|
74
86
|
elsif data.include?(":=")
|
75
|
-
|
87
|
+
Language["ECL"]
|
76
88
|
end
|
77
|
-
matches
|
78
89
|
end
|
79
90
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
matches << Language["Prolog"]
|
84
|
-
else
|
85
|
-
matches << Language["IDL"]
|
86
|
-
end
|
87
|
-
matches
|
88
|
-
end
|
89
|
-
|
90
|
-
def self.disambiguate_ts(data)
|
91
|
-
matches = []
|
92
|
-
if (data.include?("</translation>"))
|
93
|
-
matches << Language["XML"]
|
91
|
+
disambiguate "IDL", "Prolog" do |data|
|
92
|
+
if data.include?(":-")
|
93
|
+
Language["Prolog"]
|
94
94
|
else
|
95
|
-
|
95
|
+
Language["IDL"]
|
96
96
|
end
|
97
|
-
matches
|
98
97
|
end
|
99
98
|
|
100
|
-
|
101
|
-
matches = []
|
99
|
+
disambiguate "Common Lisp", "OpenCL", "Cool" do |data|
|
102
100
|
if data.include?("(defun ")
|
103
|
-
|
101
|
+
Language["Common Lisp"]
|
102
|
+
elsif /^class/x.match(data)
|
103
|
+
Language["Cool"]
|
104
104
|
elsif /\/\* |\/\/ |^\}/.match(data)
|
105
|
-
|
105
|
+
Language["OpenCL"]
|
106
106
|
end
|
107
|
-
matches
|
108
|
-
end
|
109
|
-
|
110
|
-
def self.disambiguate_r(data)
|
111
|
-
matches = []
|
112
|
-
matches << Language["Rebol"] if /\bRebol\b/i.match(data)
|
113
|
-
matches << Language["R"] if data.include?("<-")
|
114
|
-
matches
|
115
107
|
end
|
116
108
|
|
117
|
-
|
118
|
-
matches = []
|
109
|
+
disambiguate "Hack", "PHP" do |data|
|
119
110
|
if data.include?("<?hh")
|
120
|
-
|
111
|
+
Language["Hack"]
|
121
112
|
elsif /<?[^h]/.match(data)
|
122
|
-
|
113
|
+
Language["PHP"]
|
123
114
|
end
|
124
|
-
matches
|
125
115
|
end
|
126
116
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
if (/^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data))
|
133
|
-
matches << Language["Scala"]
|
117
|
+
disambiguate "Scala", "SuperCollider" do |data|
|
118
|
+
if /\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data)
|
119
|
+
Language["SuperCollider"]
|
120
|
+
elsif /^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data)
|
121
|
+
Language["Scala"]
|
134
122
|
end
|
135
|
-
matches
|
136
123
|
end
|
137
124
|
|
138
|
-
|
139
|
-
|
140
|
-
matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
|
141
|
-
matches
|
125
|
+
disambiguate "AsciiDoc", "AGS Script" do |data|
|
126
|
+
Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
|
142
127
|
end
|
143
128
|
|
144
|
-
|
145
|
-
matches = []
|
129
|
+
disambiguate "FORTRAN", "Forth" do |data|
|
146
130
|
if /^: /.match(data)
|
147
|
-
|
131
|
+
Language["Forth"]
|
148
132
|
elsif /^([c*][^a-z]| subroutine\s)/i.match(data)
|
149
|
-
|
133
|
+
Language["FORTRAN"]
|
150
134
|
end
|
151
|
-
matches
|
152
135
|
end
|
153
136
|
|
154
|
-
|
155
|
-
|
137
|
+
disambiguate "F#", "Forth", "GLSL" do |data|
|
138
|
+
if /^(: |new-device)/.match(data)
|
139
|
+
Language["Forth"]
|
140
|
+
elsif /^(#light|import|let|module|namespace|open|type)/.match(data)
|
141
|
+
Language["F#"]
|
142
|
+
elsif /^(#include|#pragma|precision|uniform|varying|void)/.match(data)
|
143
|
+
Language["GLSL"]
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
disambiguate "Gosu", "JavaScript" do |data|
|
148
|
+
Language["Gosu"] if /^uses java\./.match(data)
|
149
|
+
end
|
150
|
+
|
151
|
+
disambiguate "LoomScript", "LiveScript" do |data|
|
152
|
+
if /^\s*package\s*[\w\.\/\*\s]*\s*{/.match(data)
|
153
|
+
Language["LoomScript"]
|
154
|
+
else
|
155
|
+
Language["LiveScript"]
|
156
|
+
end
|
156
157
|
end
|
158
|
+
|
157
159
|
end
|
158
160
|
end
|
data/lib/linguist/language.rb
CHANGED
@@ -10,6 +10,8 @@ require 'linguist/heuristics'
|
|
10
10
|
require 'linguist/samples'
|
11
11
|
require 'linguist/file_blob'
|
12
12
|
require 'linguist/blob_helper'
|
13
|
+
require 'linguist/strategy/filename'
|
14
|
+
require 'linguist/shebang'
|
13
15
|
|
14
16
|
module Linguist
|
15
17
|
# Language names that are recognizable by GitHub. Defined languages
|
@@ -91,6 +93,13 @@ module Linguist
|
|
91
93
|
language
|
92
94
|
end
|
93
95
|
|
96
|
+
STRATEGIES = [
|
97
|
+
Linguist::Strategy::Filename,
|
98
|
+
Linguist::Shebang,
|
99
|
+
Linguist::Heuristics,
|
100
|
+
Linguist::Classifier
|
101
|
+
]
|
102
|
+
|
94
103
|
# Public: Detects the Language of the blob.
|
95
104
|
#
|
96
105
|
# blob - an object that includes the Linguist `BlobHelper` interface;
|
@@ -98,49 +107,22 @@ module Linguist
|
|
98
107
|
#
|
99
108
|
# Returns Language or nil.
|
100
109
|
def self.detect(blob)
|
101
|
-
name = blob.name.to_s
|
102
|
-
|
103
110
|
# Bail early if the blob is binary or empty.
|
104
111
|
return nil if blob.likely_binary? || blob.binary? || blob.empty?
|
105
112
|
|
106
|
-
#
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
# If there is more than one possible language with that extension (or no
|
118
|
-
# extension at all, in the case of extensionless scripts), we need to continue
|
119
|
-
# our detection work
|
120
|
-
if possible_languages.length > 1
|
121
|
-
data = blob.data
|
122
|
-
possible_language_names = possible_languages.map(&:name)
|
123
|
-
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
|
124
|
-
|
125
|
-
if heuristic_languages.size > 1
|
126
|
-
possible_language_names = heuristic_languages.map(&:name)
|
127
|
-
end
|
128
|
-
|
129
|
-
# Check if there's a shebang line and use that as authoritative
|
130
|
-
if (result = find_by_shebang(data)) && !result.empty?
|
131
|
-
result.first
|
132
|
-
# No shebang. Still more work to do. Try to find it with our heuristics.
|
133
|
-
elsif heuristic_languages.size == 1
|
134
|
-
heuristic_languages.first
|
135
|
-
# Lastly, fall back to the probabilistic classifier.
|
136
|
-
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
137
|
-
# Return the actual Language object based of the string language name (i.e., first element of `#classify`)
|
138
|
-
Language[classified[0]]
|
113
|
+
# Call each strategy until one candidate is returned.
|
114
|
+
STRATEGIES.reduce([]) do |languages, strategy|
|
115
|
+
candidates = strategy.call(blob, languages)
|
116
|
+
if candidates.size == 1
|
117
|
+
return candidates.first
|
118
|
+
elsif candidates.size > 1
|
119
|
+
# More than one candidate was found, pass them to the next strategy.
|
120
|
+
candidates
|
121
|
+
else
|
122
|
+
# No candidates were found, pass on languages from the previous strategy.
|
123
|
+
languages
|
139
124
|
end
|
140
|
-
|
141
|
-
# Simplest and most common case, we can just return the one match based on extension
|
142
|
-
possible_languages.first
|
143
|
-
end
|
125
|
+
end.first
|
144
126
|
end
|
145
127
|
|
146
128
|
# Public: Get all Languages
|
@@ -190,8 +172,13 @@ module Linguist
|
|
190
172
|
# Returns all matching Languages or [] if none were found.
|
191
173
|
def self.find_by_filename(filename)
|
192
174
|
basename = File.basename(filename)
|
193
|
-
|
194
|
-
|
175
|
+
|
176
|
+
# find the first extension with language definitions
|
177
|
+
extname = FileBlob.new(filename).extensions.detect do |e|
|
178
|
+
!@extension_index[e].empty?
|
179
|
+
end
|
180
|
+
|
181
|
+
(@filename_index[basename] + @extension_index[extname]).compact.uniq
|
195
182
|
end
|
196
183
|
|
197
184
|
# Public: Look up Languages by file extension.
|
@@ -212,20 +199,26 @@ module Linguist
|
|
212
199
|
@extension_index[extname]
|
213
200
|
end
|
214
201
|
|
215
|
-
#
|
202
|
+
# DEPRECATED: use find_by_interpreter(Shebang.interpreter(data)) instead.
|
203
|
+
def self.find_by_shebang(data)
|
204
|
+
@interpreter_index[Shebang.interpreter(data)]
|
205
|
+
end
|
206
|
+
|
207
|
+
# Public: Look up Languages by interpreter.
|
216
208
|
#
|
217
|
-
#
|
209
|
+
# interpreter - String of interpreter name
|
218
210
|
#
|
219
211
|
# Examples
|
220
212
|
#
|
221
|
-
# Language.
|
213
|
+
# Language.find_by_interpreter("bash")
|
222
214
|
# # => [#<Language name="Bash">]
|
223
215
|
#
|
224
216
|
# Returns an Array of the matching Languages.
|
225
|
-
def self.
|
226
|
-
@interpreter_index[
|
217
|
+
def self.find_by_interpreter(interpreter)
|
218
|
+
@interpreter_index[interpreter]
|
227
219
|
end
|
228
220
|
|
221
|
+
|
229
222
|
# Public: Look up Language by its name or lexer.
|
230
223
|
#
|
231
224
|
# name - The String name of the Language
|