github-linguist 4.0.3 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c83f59f5d07b0f9d0b233205ad068fdce8977d9d
4
- data.tar.gz: 255fb8be50d94c14dde66750ebdc262a52538413
3
+ metadata.gz: 4fd8402379e8ac3de17921cf9831e6db303bea33
4
+ data.tar.gz: 546482b4f73f6c6a512b0258e0768aeb51fdbfb1
5
5
  SHA512:
6
- metadata.gz: f3d65aaf387f84f956dffd975d2f5e9869f3f2631a429b1d4819525ffdf2816783a7324d9b66821dedd7080c9f9718b58201fad0d04747fe3a83136151ef547a
7
- data.tar.gz: c1f9d19147a6e1ae946dd75222f45ed72e59b0643953c3365c2547a2f7d6847639176aba6c14c69a00cfcca0cd61794d63afefdcc773395efca4c7f2112e533d
6
+ metadata.gz: 73d9c3e80b0884a3ef96f5281028bf943f183de4f11aecd5ef31b986fb8f13d4fa70a38a475a0ca6ba0e3aa20cd3b8965edd41e5f26f15e35ae375c30974382b
7
+ data.tar.gz: 18d6610244f861a7c4f925b1332370b39901fb7f640efba8b5a83ca7f80321aa17cf5757dd532b7c54206b1c2b66e99c00aafe2213aefcd695205af7afc01bff
data/lib/linguist.rb CHANGED
@@ -4,4 +4,5 @@ require 'linguist/heuristics'
4
4
  require 'linguist/language'
5
5
  require 'linguist/repository'
6
6
  require 'linguist/samples'
7
+ require 'linguist/shebang'
7
8
  require 'linguist/version'
@@ -3,6 +3,25 @@ require 'linguist/tokenizer'
3
3
  module Linguist
4
4
  # Language bayesian classifier.
5
5
  class Classifier
6
+ # Public: Use the classifier to detect language of the blob.
7
+ #
8
+ # blob - An object that quacks like a blob.
9
+ # possible_languages - Array of Language objects
10
+ #
11
+ # Examples
12
+ #
13
+ # Classifier.call(FileBlob.new("path/to/file"), [
14
+ # Language["Ruby"], Language["Python"]
15
+ # ])
16
+ #
17
+ # Returns an Array of Language objects, most probable first.
18
+ def self.call(blob, possible_languages)
19
+ language_names = possible_languages.map(&:name)
20
+ classify(Samples.cache, blob.data, language_names).map do |name, _|
21
+ Language[name] # Return the actual Language objects
22
+ end
23
+ end
24
+
6
25
  # Public: Train classifier that data is a certain language.
7
26
  #
8
27
  # db - Hash classifier database object
@@ -57,14 +57,20 @@ module Linguist
57
57
  #
58
58
  # Returns a String.
59
59
  def extension
60
- # File.extname returns nil if the filename is an extension.
61
- extension = File.extname(name)
62
- basename = File.basename(name)
63
- # Checks if the filename is an extension.
64
- if extension.empty? && basename[0] == "."
65
- basename
66
- else
67
- extension
60
+ extensions.last || ""
61
+ end
62
+
63
+ # Public: Return an array of the file extensions
64
+ #
65
+ # >> Linguist::FileBlob.new("app/views/things/index.html.erb").extensions
66
+ # => [".html.erb", ".erb"]
67
+ #
68
+ # Returns an Array
69
+ def extensions
70
+ basename, *segments = File.basename(name).split(".")
71
+
72
+ segments.map.with_index do |segment, index|
73
+ "." + segments[index..-1].join(".")
68
74
  end
69
75
  end
70
76
  end
@@ -1,158 +1,160 @@
1
1
  module Linguist
2
2
  # A collection of simple heuristics that can be used to better analyze languages.
3
3
  class Heuristics
4
- ACTIVE = true
5
-
6
- # Public: Given an array of String language names,
7
- # apply heuristics against the given data and return an array
8
- # of matching languages, or nil.
4
+ # Public: Use heuristics to detect language of the blob.
5
+ #
6
+ # blob - An object that quacks like a blob.
7
+ # possible_languages - Array of Language objects
8
+ #
9
+ # Examples
9
10
  #
10
- # data - Array of tokens or String data to analyze.
11
- # languages - Array of language name Strings to restrict to.
11
+ # Heuristics.call(FileBlob.new("path/to/file"), [
12
+ # Language["Ruby"], Language["Python"]
13
+ # ])
12
14
  #
13
- # Returns an array of Languages or []
14
- def self.find_by_heuristics(data, languages)
15
- if active?
16
- result = []
17
-
18
- if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
19
- result = disambiguate_pl(data)
20
- end
21
- if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
22
- result = disambiguate_ecl(data)
23
- end
24
- if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
25
- result = disambiguate_pro(data)
26
- end
27
- if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
28
- result = disambiguate_cl(data)
29
- end
30
- if languages.all? { |l| ["Hack", "PHP"].include?(l) }
31
- result = disambiguate_hack(data)
32
- end
33
- if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
34
- result = disambiguate_sc(data)
35
- end
36
- if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
37
- result = disambiguate_asc(data)
38
- end
39
- if languages.all? { |l| ["FORTRAN", "Forth"].include?(l) }
40
- result = disambiguate_f(data)
41
- end
42
- return result
15
+ # Returns an Array of languages, or empty if none matched or were inconclusive.
16
+ def self.call(blob, languages)
17
+ data = blob.data
18
+
19
+ @heuristics.each do |heuristic|
20
+ return Array(heuristic.call(data)) if heuristic.matches?(languages)
43
21
  end
22
+
23
+ [] # No heuristics matched
44
24
  end
45
25
 
46
- # .h extensions are ambiguous between C, C++, and Objective-C.
47
- # We want to shortcut look for Objective-C _and_ now C++ too!
26
+ # Internal: Define a new heuristic.
48
27
  #
49
- # Returns an array of Languages or []
50
- def self.disambiguate_c(data)
51
- matches = []
52
- if data.include?("@interface")
53
- matches << Language["Objective-C"]
54
- elsif data.include?("#include <cstdint>")
55
- matches << Language["C++"]
28
+ # languages - String names of languages to disambiguate.
29
+ # heuristic - Block which takes data as an argument and returns a Language or nil.
30
+ #
31
+ # Examples
32
+ #
33
+ # disambiguate "Perl", "Prolog" do |data|
34
+ # if data.include?("use strict")
35
+ # Language["Perl"]
36
+ # elsif data.include?(":-")
37
+ # Language["Prolog"]
38
+ # end
39
+ # end
40
+ #
41
+ def self.disambiguate(*languages, &heuristic)
42
+ @heuristics << new(languages, &heuristic)
43
+ end
44
+
45
+ # Internal: Array of defined heuristics
46
+ @heuristics = []
47
+
48
+ # Internal
49
+ def initialize(languages, &heuristic)
50
+ @languages = languages
51
+ @heuristic = heuristic
52
+ end
53
+
54
+ # Internal: Check if this heuristic matches the candidate languages.
55
+ def matches?(candidates)
56
+ candidates.all? { |l| @languages.include?(l.name) }
57
+ end
58
+
59
+ # Internal: Perform the heuristic
60
+ def call(data)
61
+ @heuristic.call(data)
62
+ end
63
+
64
+ disambiguate "Objective-C", "C++", "C" do |data|
65
+ if (/@(interface|class|protocol|property|end|synchronised|selector|implementation)\b/.match(data))
66
+ Language["Objective-C"]
67
+ elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
68
+ /^\s*template\s*</.match(data) || /^[^@]class\s+\w+/.match(data) || /^[^@](private|public|protected):$/.match(data) || /std::.+$/.match(data))
69
+ Language["C++"]
56
70
  end
57
- matches
58
71
  end
59
72
 
60
- def self.disambiguate_pl(data)
61
- matches = []
62
- if data.include?("use strict")
63
- matches << Language["Perl"]
73
+ disambiguate "Perl", "Perl6", "Prolog" do |data|
74
+ if data.include?("use v6")
75
+ Language["Perl6"]
76
+ elsif data.include?("use strict")
77
+ Language["Perl"]
64
78
  elsif data.include?(":-")
65
- matches << Language["Prolog"]
79
+ Language["Prolog"]
66
80
  end
67
- matches
68
81
  end
69
82
 
70
- def self.disambiguate_ecl(data)
71
- matches = []
83
+ disambiguate "ECL", "Prolog" do |data|
72
84
  if data.include?(":-")
73
- matches << Language["Prolog"]
85
+ Language["Prolog"]
74
86
  elsif data.include?(":=")
75
- matches << Language["ECL"]
87
+ Language["ECL"]
76
88
  end
77
- matches
78
89
  end
79
90
 
80
- def self.disambiguate_pro(data)
81
- matches = []
82
- if (data.include?(":-"))
83
- matches << Language["Prolog"]
84
- else
85
- matches << Language["IDL"]
86
- end
87
- matches
88
- end
89
-
90
- def self.disambiguate_ts(data)
91
- matches = []
92
- if (data.include?("</translation>"))
93
- matches << Language["XML"]
91
+ disambiguate "IDL", "Prolog" do |data|
92
+ if data.include?(":-")
93
+ Language["Prolog"]
94
94
  else
95
- matches << Language["TypeScript"]
95
+ Language["IDL"]
96
96
  end
97
- matches
98
97
  end
99
98
 
100
- def self.disambiguate_cl(data)
101
- matches = []
99
+ disambiguate "Common Lisp", "OpenCL", "Cool" do |data|
102
100
  if data.include?("(defun ")
103
- matches << Language["Common Lisp"]
101
+ Language["Common Lisp"]
102
+ elsif /^class/x.match(data)
103
+ Language["Cool"]
104
104
  elsif /\/\* |\/\/ |^\}/.match(data)
105
- matches << Language["OpenCL"]
105
+ Language["OpenCL"]
106
106
  end
107
- matches
108
- end
109
-
110
- def self.disambiguate_r(data)
111
- matches = []
112
- matches << Language["Rebol"] if /\bRebol\b/i.match(data)
113
- matches << Language["R"] if data.include?("<-")
114
- matches
115
107
  end
116
108
 
117
- def self.disambiguate_hack(data)
118
- matches = []
109
+ disambiguate "Hack", "PHP" do |data|
119
110
  if data.include?("<?hh")
120
- matches << Language["Hack"]
111
+ Language["Hack"]
121
112
  elsif /<?[^h]/.match(data)
122
- matches << Language["PHP"]
113
+ Language["PHP"]
123
114
  end
124
- matches
125
115
  end
126
116
 
127
- def self.disambiguate_sc(data)
128
- matches = []
129
- if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data))
130
- matches << Language["SuperCollider"]
131
- end
132
- if (/^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data))
133
- matches << Language["Scala"]
117
+ disambiguate "Scala", "SuperCollider" do |data|
118
+ if /\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data)
119
+ Language["SuperCollider"]
120
+ elsif /^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data)
121
+ Language["Scala"]
134
122
  end
135
- matches
136
123
  end
137
124
 
138
- def self.disambiguate_asc(data)
139
- matches = []
140
- matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
141
- matches
125
+ disambiguate "AsciiDoc", "AGS Script" do |data|
126
+ Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
142
127
  end
143
128
 
144
- def self.disambiguate_f(data)
145
- matches = []
129
+ disambiguate "FORTRAN", "Forth" do |data|
146
130
  if /^: /.match(data)
147
- matches << Language["Forth"]
131
+ Language["Forth"]
148
132
  elsif /^([c*][^a-z]| subroutine\s)/i.match(data)
149
- matches << Language["FORTRAN"]
133
+ Language["FORTRAN"]
150
134
  end
151
- matches
152
135
  end
153
136
 
154
- def self.active?
155
- !!ACTIVE
137
+ disambiguate "F#", "Forth", "GLSL" do |data|
138
+ if /^(: |new-device)/.match(data)
139
+ Language["Forth"]
140
+ elsif /^(#light|import|let|module|namespace|open|type)/.match(data)
141
+ Language["F#"]
142
+ elsif /^(#include|#pragma|precision|uniform|varying|void)/.match(data)
143
+ Language["GLSL"]
144
+ end
145
+ end
146
+
147
+ disambiguate "Gosu", "JavaScript" do |data|
148
+ Language["Gosu"] if /^uses java\./.match(data)
149
+ end
150
+
151
+ disambiguate "LoomScript", "LiveScript" do |data|
152
+ if /^\s*package\s*[\w\.\/\*\s]*\s*{/.match(data)
153
+ Language["LoomScript"]
154
+ else
155
+ Language["LiveScript"]
156
+ end
156
157
  end
158
+
157
159
  end
158
160
  end
@@ -10,6 +10,8 @@ require 'linguist/heuristics'
10
10
  require 'linguist/samples'
11
11
  require 'linguist/file_blob'
12
12
  require 'linguist/blob_helper'
13
+ require 'linguist/strategy/filename'
14
+ require 'linguist/shebang'
13
15
 
14
16
  module Linguist
15
17
  # Language names that are recognizable by GitHub. Defined languages
@@ -91,6 +93,13 @@ module Linguist
91
93
  language
92
94
  end
93
95
 
96
+ STRATEGIES = [
97
+ Linguist::Strategy::Filename,
98
+ Linguist::Shebang,
99
+ Linguist::Heuristics,
100
+ Linguist::Classifier
101
+ ]
102
+
94
103
  # Public: Detects the Language of the blob.
95
104
  #
96
105
  # blob - an object that includes the Linguist `BlobHelper` interface;
@@ -98,49 +107,22 @@ module Linguist
98
107
  #
99
108
  # Returns Language or nil.
100
109
  def self.detect(blob)
101
- name = blob.name.to_s
102
-
103
110
  # Bail early if the blob is binary or empty.
104
111
  return nil if blob.likely_binary? || blob.binary? || blob.empty?
105
112
 
106
- # A bit of an elegant hack. If the file is executable but extensionless,
107
- # append a "magic" extension so it can be classified with other
108
- # languages that have shebang scripts.
109
- extension = FileBlob.new(name).extension
110
- if extension.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
111
- name += ".script!"
112
- end
113
-
114
- # First try to find languages that match based on filename.
115
- possible_languages = find_by_filename(name)
116
-
117
- # If there is more than one possible language with that extension (or no
118
- # extension at all, in the case of extensionless scripts), we need to continue
119
- # our detection work
120
- if possible_languages.length > 1
121
- data = blob.data
122
- possible_language_names = possible_languages.map(&:name)
123
- heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
124
-
125
- if heuristic_languages.size > 1
126
- possible_language_names = heuristic_languages.map(&:name)
127
- end
128
-
129
- # Check if there's a shebang line and use that as authoritative
130
- if (result = find_by_shebang(data)) && !result.empty?
131
- result.first
132
- # No shebang. Still more work to do. Try to find it with our heuristics.
133
- elsif heuristic_languages.size == 1
134
- heuristic_languages.first
135
- # Lastly, fall back to the probabilistic classifier.
136
- elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
137
- # Return the actual Language object based of the string language name (i.e., first element of `#classify`)
138
- Language[classified[0]]
113
+ # Call each strategy until one candidate is returned.
114
+ STRATEGIES.reduce([]) do |languages, strategy|
115
+ candidates = strategy.call(blob, languages)
116
+ if candidates.size == 1
117
+ return candidates.first
118
+ elsif candidates.size > 1
119
+ # More than one candidate was found, pass them to the next strategy.
120
+ candidates
121
+ else
122
+ # No candiates were found, pass on languages from the previous strategy.
123
+ languages
139
124
  end
140
- else
141
- # Simplest and most common case, we can just return the one match based on extension
142
- possible_languages.first
143
- end
125
+ end.first
144
126
  end
145
127
 
146
128
  # Public: Get all Languages
@@ -190,8 +172,13 @@ module Linguist
190
172
  # Returns all matching Languages or [] if none were found.
191
173
  def self.find_by_filename(filename)
192
174
  basename = File.basename(filename)
193
- extname = FileBlob.new(filename).extension
194
- (@filename_index[basename] + find_by_extension(extname)).compact.uniq
175
+
176
+ # find the first extension with language definitions
177
+ extname = FileBlob.new(filename).extensions.detect do |e|
178
+ !@extension_index[e].empty?
179
+ end
180
+
181
+ (@filename_index[basename] + @extension_index[extname]).compact.uniq
195
182
  end
196
183
 
197
184
  # Public: Look up Languages by file extension.
@@ -212,20 +199,26 @@ module Linguist
212
199
  @extension_index[extname]
213
200
  end
214
201
 
215
- # Public: Look up Languages by shebang line.
202
+ # DEPRECATED
203
+ def self.find_by_shebang(data)
204
+ @interpreter_index[Shebang.interpreter(data)]
205
+ end
206
+
207
+ # Public: Look up Languages by interpreter.
216
208
  #
217
- # data - Array of tokens or String data to analyze.
209
+ # interpreter - String of interpreter name
218
210
  #
219
211
  # Examples
220
212
  #
221
- # Language.find_by_shebang("#!/bin/bash\ndate;")
213
+ # Language.find_by_interpreter("bash")
222
214
  # # => [#<Language name="Bash">]
223
215
  #
224
216
  # Returns the matching Language
225
- def self.find_by_shebang(data)
226
- @interpreter_index[Linguist.interpreter_from_shebang(data)]
217
+ def self.find_by_interpreter(interpreter)
218
+ @interpreter_index[interpreter]
227
219
  end
228
220
 
221
+
229
222
  # Public: Look up Language by its name or lexer.
230
223
  #
231
224
  # name - The String name of the Language