github-linguist 4.0.3 → 4.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c83f59f5d07b0f9d0b233205ad068fdce8977d9d
4
- data.tar.gz: 255fb8be50d94c14dde66750ebdc262a52538413
3
+ metadata.gz: 4fd8402379e8ac3de17921cf9831e6db303bea33
4
+ data.tar.gz: 546482b4f73f6c6a512b0258e0768aeb51fdbfb1
5
5
  SHA512:
6
- metadata.gz: f3d65aaf387f84f956dffd975d2f5e9869f3f2631a429b1d4819525ffdf2816783a7324d9b66821dedd7080c9f9718b58201fad0d04747fe3a83136151ef547a
7
- data.tar.gz: c1f9d19147a6e1ae946dd75222f45ed72e59b0643953c3365c2547a2f7d6847639176aba6c14c69a00cfcca0cd61794d63afefdcc773395efca4c7f2112e533d
6
+ metadata.gz: 73d9c3e80b0884a3ef96f5281028bf943f183de4f11aecd5ef31b986fb8f13d4fa70a38a475a0ca6ba0e3aa20cd3b8965edd41e5f26f15e35ae375c30974382b
7
+ data.tar.gz: 18d6610244f861a7c4f925b1332370b39901fb7f640efba8b5a83ca7f80321aa17cf5757dd532b7c54206b1c2b66e99c00aafe2213aefcd695205af7afc01bff
data/lib/linguist.rb CHANGED
@@ -4,4 +4,5 @@ require 'linguist/heuristics'
4
4
  require 'linguist/language'
5
5
  require 'linguist/repository'
6
6
  require 'linguist/samples'
7
+ require 'linguist/shebang'
7
8
  require 'linguist/version'
@@ -3,6 +3,25 @@ require 'linguist/tokenizer'
3
3
  module Linguist
4
4
  # Language bayesian classifier.
5
5
  class Classifier
6
+ # Public: Use the classifier to detect language of the blob.
7
+ #
8
+ # blob - An object that quacks like a blob.
9
+ # possible_languages - Array of Language objects
10
+ #
11
+ # Examples
12
+ #
13
+ # Classifier.call(FileBlob.new("path/to/file"), [
14
+ # Language["Ruby"], Language["Python"]
15
+ # ])
16
+ #
17
+ # Returns an Array of Language objects, most probable first.
18
+ def self.call(blob, possible_languages)
19
+ language_names = possible_languages.map(&:name)
20
+ classify(Samples.cache, blob.data, language_names).map do |name, _|
21
+ Language[name] # Return the actual Language objects
22
+ end
23
+ end
24
+
6
25
  # Public: Train classifier that data is a certain language.
7
26
  #
8
27
  # db - Hash classifier database object
@@ -57,14 +57,20 @@ module Linguist
57
57
  #
58
58
  # Returns a String.
59
59
  def extension
60
- # File.extname returns nil if the filename is an extension.
61
- extension = File.extname(name)
62
- basename = File.basename(name)
63
- # Checks if the filename is an extension.
64
- if extension.empty? && basename[0] == "."
65
- basename
66
- else
67
- extension
60
+ extensions.last || ""
61
+ end
62
+
63
+ # Public: Return an array of the file extensions
64
+ #
65
+ # >> Linguist::FileBlob.new("app/views/things/index.html.erb").extensions
66
+ # => [".html.erb", ".erb"]
67
+ #
68
+ # Returns an Array
69
+ def extensions
70
+ basename, *segments = File.basename(name).split(".")
71
+
72
+ segments.map.with_index do |segment, index|
73
+ "." + segments[index..-1].join(".")
68
74
  end
69
75
  end
70
76
  end
@@ -1,158 +1,160 @@
1
1
  module Linguist
2
2
  # A collection of simple heuristics that can be used to better analyze languages.
3
3
  class Heuristics
4
- ACTIVE = true
5
-
6
- # Public: Given an array of String language names,
7
- # apply heuristics against the given data and return an array
8
- # of matching languages, or nil.
4
+ # Public: Use heuristics to detect language of the blob.
5
+ #
6
+ # blob - An object that quacks like a blob.
7
+ # possible_languages - Array of Language objects
8
+ #
9
+ # Examples
9
10
  #
10
- # data - Array of tokens or String data to analyze.
11
- # languages - Array of language name Strings to restrict to.
11
+ # Heuristics.call(FileBlob.new("path/to/file"), [
12
+ # Language["Ruby"], Language["Python"]
13
+ # ])
12
14
  #
13
- # Returns an array of Languages or []
14
- def self.find_by_heuristics(data, languages)
15
- if active?
16
- result = []
17
-
18
- if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
19
- result = disambiguate_pl(data)
20
- end
21
- if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
22
- result = disambiguate_ecl(data)
23
- end
24
- if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
25
- result = disambiguate_pro(data)
26
- end
27
- if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
28
- result = disambiguate_cl(data)
29
- end
30
- if languages.all? { |l| ["Hack", "PHP"].include?(l) }
31
- result = disambiguate_hack(data)
32
- end
33
- if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
34
- result = disambiguate_sc(data)
35
- end
36
- if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
37
- result = disambiguate_asc(data)
38
- end
39
- if languages.all? { |l| ["FORTRAN", "Forth"].include?(l) }
40
- result = disambiguate_f(data)
41
- end
42
- return result
15
+ # Returns an Array of languages, or empty if none matched or were inconclusive.
16
+ def self.call(blob, languages)
17
+ data = blob.data
18
+
19
+ @heuristics.each do |heuristic|
20
+ return Array(heuristic.call(data)) if heuristic.matches?(languages)
43
21
  end
22
+
23
+ [] # No heuristics matched
44
24
  end
45
25
 
46
- # .h extensions are ambiguous between C, C++, and Objective-C.
47
- # We want to shortcut look for Objective-C _and_ now C++ too!
26
+ # Internal: Define a new heuristic.
48
27
  #
49
- # Returns an array of Languages or []
50
- def self.disambiguate_c(data)
51
- matches = []
52
- if data.include?("@interface")
53
- matches << Language["Objective-C"]
54
- elsif data.include?("#include <cstdint>")
55
- matches << Language["C++"]
28
+ # languages - String names of languages to disambiguate.
29
+ # heuristic - Block which takes data as an argument and returns a Language or nil.
30
+ #
31
+ # Examples
32
+ #
33
+ # disambiguate "Perl", "Prolog" do |data|
34
+ # if data.include?("use strict")
35
+ # Language["Perl"]
36
+ # elsif data.include?(":-")
37
+ # Language["Prolog"]
38
+ # end
39
+ # end
40
+ #
41
+ def self.disambiguate(*languages, &heuristic)
42
+ @heuristics << new(languages, &heuristic)
43
+ end
44
+
45
+ # Internal: Array of defined heuristics
46
+ @heuristics = []
47
+
48
+ # Internal
49
+ def initialize(languages, &heuristic)
50
+ @languages = languages
51
+ @heuristic = heuristic
52
+ end
53
+
54
+ # Internal: Check if this heuristic matches the candidate languages.
55
+ def matches?(candidates)
56
+ candidates.all? { |l| @languages.include?(l.name) }
57
+ end
58
+
59
+ # Internal: Perform the heuristic
60
+ def call(data)
61
+ @heuristic.call(data)
62
+ end
63
+
64
+ disambiguate "Objective-C", "C++", "C" do |data|
65
+ if (/@(interface|class|protocol|property|end|synchronised|selector|implementation)\b/.match(data))
66
+ Language["Objective-C"]
67
+ elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
68
+ /^\s*template\s*</.match(data) || /^[^@]class\s+\w+/.match(data) || /^[^@](private|public|protected):$/.match(data) || /std::.+$/.match(data))
69
+ Language["C++"]
56
70
  end
57
- matches
58
71
  end
59
72
 
60
- def self.disambiguate_pl(data)
61
- matches = []
62
- if data.include?("use strict")
63
- matches << Language["Perl"]
73
+ disambiguate "Perl", "Perl6", "Prolog" do |data|
74
+ if data.include?("use v6")
75
+ Language["Perl6"]
76
+ elsif data.include?("use strict")
77
+ Language["Perl"]
64
78
  elsif data.include?(":-")
65
- matches << Language["Prolog"]
79
+ Language["Prolog"]
66
80
  end
67
- matches
68
81
  end
69
82
 
70
- def self.disambiguate_ecl(data)
71
- matches = []
83
+ disambiguate "ECL", "Prolog" do |data|
72
84
  if data.include?(":-")
73
- matches << Language["Prolog"]
85
+ Language["Prolog"]
74
86
  elsif data.include?(":=")
75
- matches << Language["ECL"]
87
+ Language["ECL"]
76
88
  end
77
- matches
78
89
  end
79
90
 
80
- def self.disambiguate_pro(data)
81
- matches = []
82
- if (data.include?(":-"))
83
- matches << Language["Prolog"]
84
- else
85
- matches << Language["IDL"]
86
- end
87
- matches
88
- end
89
-
90
- def self.disambiguate_ts(data)
91
- matches = []
92
- if (data.include?("</translation>"))
93
- matches << Language["XML"]
91
+ disambiguate "IDL", "Prolog" do |data|
92
+ if data.include?(":-")
93
+ Language["Prolog"]
94
94
  else
95
- matches << Language["TypeScript"]
95
+ Language["IDL"]
96
96
  end
97
- matches
98
97
  end
99
98
 
100
- def self.disambiguate_cl(data)
101
- matches = []
99
+ disambiguate "Common Lisp", "OpenCL", "Cool" do |data|
102
100
  if data.include?("(defun ")
103
- matches << Language["Common Lisp"]
101
+ Language["Common Lisp"]
102
+ elsif /^class/x.match(data)
103
+ Language["Cool"]
104
104
  elsif /\/\* |\/\/ |^\}/.match(data)
105
- matches << Language["OpenCL"]
105
+ Language["OpenCL"]
106
106
  end
107
- matches
108
- end
109
-
110
- def self.disambiguate_r(data)
111
- matches = []
112
- matches << Language["Rebol"] if /\bRebol\b/i.match(data)
113
- matches << Language["R"] if data.include?("<-")
114
- matches
115
107
  end
116
108
 
117
- def self.disambiguate_hack(data)
118
- matches = []
109
+ disambiguate "Hack", "PHP" do |data|
119
110
  if data.include?("<?hh")
120
- matches << Language["Hack"]
111
+ Language["Hack"]
121
112
  elsif /<?[^h]/.match(data)
122
- matches << Language["PHP"]
113
+ Language["PHP"]
123
114
  end
124
- matches
125
115
  end
126
116
 
127
- def self.disambiguate_sc(data)
128
- matches = []
129
- if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data))
130
- matches << Language["SuperCollider"]
131
- end
132
- if (/^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data))
133
- matches << Language["Scala"]
117
+ disambiguate "Scala", "SuperCollider" do |data|
118
+ if /\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data)
119
+ Language["SuperCollider"]
120
+ elsif /^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data)
121
+ Language["Scala"]
134
122
  end
135
- matches
136
123
  end
137
124
 
138
- def self.disambiguate_asc(data)
139
- matches = []
140
- matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
141
- matches
125
+ disambiguate "AsciiDoc", "AGS Script" do |data|
126
+ Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
142
127
  end
143
128
 
144
- def self.disambiguate_f(data)
145
- matches = []
129
+ disambiguate "FORTRAN", "Forth" do |data|
146
130
  if /^: /.match(data)
147
- matches << Language["Forth"]
131
+ Language["Forth"]
148
132
  elsif /^([c*][^a-z]| subroutine\s)/i.match(data)
149
- matches << Language["FORTRAN"]
133
+ Language["FORTRAN"]
150
134
  end
151
- matches
152
135
  end
153
136
 
154
- def self.active?
155
- !!ACTIVE
137
+ disambiguate "F#", "Forth", "GLSL" do |data|
138
+ if /^(: |new-device)/.match(data)
139
+ Language["Forth"]
140
+ elsif /^(#light|import|let|module|namespace|open|type)/.match(data)
141
+ Language["F#"]
142
+ elsif /^(#include|#pragma|precision|uniform|varying|void)/.match(data)
143
+ Language["GLSL"]
144
+ end
145
+ end
146
+
147
+ disambiguate "Gosu", "JavaScript" do |data|
148
+ Language["Gosu"] if /^uses java\./.match(data)
149
+ end
150
+
151
+ disambiguate "LoomScript", "LiveScript" do |data|
152
+ if /^\s*package\s*[\w\.\/\*\s]*\s*{/.match(data)
153
+ Language["LoomScript"]
154
+ else
155
+ Language["LiveScript"]
156
+ end
156
157
  end
158
+
157
159
  end
158
160
  end
@@ -10,6 +10,8 @@ require 'linguist/heuristics'
10
10
  require 'linguist/samples'
11
11
  require 'linguist/file_blob'
12
12
  require 'linguist/blob_helper'
13
+ require 'linguist/strategy/filename'
14
+ require 'linguist/shebang'
13
15
 
14
16
  module Linguist
15
17
  # Language names that are recognizable by GitHub. Defined languages
@@ -91,6 +93,13 @@ module Linguist
91
93
  language
92
94
  end
93
95
 
96
+ STRATEGIES = [
97
+ Linguist::Strategy::Filename,
98
+ Linguist::Shebang,
99
+ Linguist::Heuristics,
100
+ Linguist::Classifier
101
+ ]
102
+
94
103
  # Public: Detects the Language of the blob.
95
104
  #
96
105
  # blob - an object that includes the Linguist `BlobHelper` interface;
@@ -98,49 +107,22 @@ module Linguist
98
107
  #
99
108
  # Returns Language or nil.
100
109
  def self.detect(blob)
101
- name = blob.name.to_s
102
-
103
110
  # Bail early if the blob is binary or empty.
104
111
  return nil if blob.likely_binary? || blob.binary? || blob.empty?
105
112
 
106
- # A bit of an elegant hack. If the file is executable but extensionless,
107
- # append a "magic" extension so it can be classified with other
108
- # languages that have shebang scripts.
109
- extension = FileBlob.new(name).extension
110
- if extension.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
111
- name += ".script!"
112
- end
113
-
114
- # First try to find languages that match based on filename.
115
- possible_languages = find_by_filename(name)
116
-
117
- # If there is more than one possible language with that extension (or no
118
- # extension at all, in the case of extensionless scripts), we need to continue
119
- # our detection work
120
- if possible_languages.length > 1
121
- data = blob.data
122
- possible_language_names = possible_languages.map(&:name)
123
- heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
124
-
125
- if heuristic_languages.size > 1
126
- possible_language_names = heuristic_languages.map(&:name)
127
- end
128
-
129
- # Check if there's a shebang line and use that as authoritative
130
- if (result = find_by_shebang(data)) && !result.empty?
131
- result.first
132
- # No shebang. Still more work to do. Try to find it with our heuristics.
133
- elsif heuristic_languages.size == 1
134
- heuristic_languages.first
135
- # Lastly, fall back to the probabilistic classifier.
136
- elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
137
- # Return the actual Language object based of the string language name (i.e., first element of `#classify`)
138
- Language[classified[0]]
113
+ # Call each strategy until one candidate is returned.
114
+ STRATEGIES.reduce([]) do |languages, strategy|
115
+ candidates = strategy.call(blob, languages)
116
+ if candidates.size == 1
117
+ return candidates.first
118
+ elsif candidates.size > 1
119
+ # More than one candidate was found, pass them to the next strategy.
120
+ candidates
121
+ else
122
+ # No candiates were found, pass on languages from the previous strategy.
123
+ languages
139
124
  end
140
- else
141
- # Simplest and most common case, we can just return the one match based on extension
142
- possible_languages.first
143
- end
125
+ end.first
144
126
  end
145
127
 
146
128
  # Public: Get all Languages
@@ -190,8 +172,13 @@ module Linguist
190
172
  # Returns all matching Languages or [] if none were found.
191
173
  def self.find_by_filename(filename)
192
174
  basename = File.basename(filename)
193
- extname = FileBlob.new(filename).extension
194
- (@filename_index[basename] + find_by_extension(extname)).compact.uniq
175
+
176
+ # find the first extension with language definitions
177
+ extname = FileBlob.new(filename).extensions.detect do |e|
178
+ !@extension_index[e].empty?
179
+ end
180
+
181
+ (@filename_index[basename] + @extension_index[extname]).compact.uniq
195
182
  end
196
183
 
197
184
  # Public: Look up Languages by file extension.
@@ -212,20 +199,26 @@ module Linguist
212
199
  @extension_index[extname]
213
200
  end
214
201
 
215
- # Public: Look up Languages by shebang line.
202
+ # DEPRECATED
203
+ def self.find_by_shebang(data)
204
+ @interpreter_index[Shebang.interpreter(data)]
205
+ end
206
+
207
+ # Public: Look up Languages by interpreter.
216
208
  #
217
- # data - Array of tokens or String data to analyze.
209
+ # interpreter - String of interpreter name
218
210
  #
219
211
  # Examples
220
212
  #
221
- # Language.find_by_shebang("#!/bin/bash\ndate;")
213
+ # Language.find_by_interpreter("bash")
222
214
  # # => [#<Language name="Bash">]
223
215
  #
224
216
  # Returns an Array of the matching Languages.
225
- def self.find_by_shebang(data)
226
- @interpreter_index[Linguist.interpreter_from_shebang(data)]
217
+ def self.find_by_interpreter(interpreter)
218
+ @interpreter_index[interpreter]
227
219
  end
228
220
 
221
+
229
222
  # Public: Look up Language by its name or lexer.
230
223
  #
231
224
  # name - The String name of the Language