gitlab-linguist 2.9.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/linguist +46 -0
- data/lib/linguist.rb +5 -0
- data/lib/linguist/blob_helper.rb +316 -0
- data/lib/linguist/classifier.rb +171 -0
- data/lib/linguist/file_blob.rb +56 -0
- data/lib/linguist/generated.rb +185 -0
- data/lib/linguist/language.rb +495 -0
- data/lib/linguist/languages.yml +1585 -0
- data/lib/linguist/md5.rb +38 -0
- data/lib/linguist/popular.yml +29 -0
- data/lib/linguist/repository.rb +95 -0
- data/lib/linguist/samples.json +41457 -0
- data/lib/linguist/samples.rb +98 -0
- data/lib/linguist/tokenizer.rb +198 -0
- data/lib/linguist/vendor.yml +129 -0
- metadata +171 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1f61bbc6a1106207f7c4791dc3c4bcd83600fa59
|
4
|
+
data.tar.gz: 1433a3391e6247ba26603ba25a7028f3fea9a45f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9902468506da9cc6e5a8ddf684001c7e117a0285717aedb625cea00c7ba19f6689131fadd8a10e19cbe02bc666fd1e02bc49afe75b8e37fd8ce95184b43e6e61
|
7
|
+
data.tar.gz: 18eb029e57495598de5b8c9b8d9d630b9160d3c0b2c8d0db4ac212924d0f6952726ee9134a0288ae8a4fec4be529899fac88fa68d39c319ac19a60a3864ee720
|
data/bin/linguist
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby

# linguist — detect language type for a file, or, given a directory, determine language breakdown
#
# usage: linguist <path>

require 'linguist/file_blob'
require 'linguist/repository'

# Default to the current working directory when no path argument is given.
target = ARGV[0] || Dir.pwd

if File.directory?(target)
  # Directory mode: print each language's share of the repository, largest first.
  repository = Linguist::Repository.from_directory(target)
  ranked = repository.languages.sort_by { |_, size| size }.reverse
  ranked.each do |language, size|
    percentage = ((size / repository.size.to_f) * 100).round
    puts "%-4s %s" % ["#{percentage}%", language]
  end
elsif File.file?(target)
  # File mode: report line counts, content type, and detection flags for one blob.
  blob = Linguist::FileBlob.new(target, Dir.pwd)

  kind =
    if blob.text?
      'Text'
    elsif blob.image?
      'Image'
    else
      'Binary'
    end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts " type: #{kind}"
  puts " mime type: #{blob.mime_type}"
  puts " language: #{blob.language}"

  puts " blob is too large to be shown" if blob.large?

  puts " appears to be generated source code" if blob.generated?

  puts " appears to be a vendored file" if blob.vendored?
else
  abort "usage: linguist <path>"
end
|
data/lib/linguist/blob_helper.rb
ADDED
@@ -0,0 +1,316 @@
|
|
1
|
+
require 'linguist/generated'
|
2
|
+
require 'linguist/language'
|
3
|
+
|
4
|
+
require 'charlock_holmes'
|
5
|
+
require 'escape_utils'
|
6
|
+
require 'mime/types'
|
7
|
+
require 'pygments'
|
8
|
+
require 'yaml'
|
9
|
+
|
10
|
+
module Linguist
  # DEPRECATED Avoid mixing into Blob classes. Prefer functional interfaces
  # like `Language.detect` over `Blob#language`. Functions are much easier to
  # cache and compose.
  #
  # Avoid adding additional bloat to this module.
  #
  # BlobHelper is a mixin for Blobish classes that respond to "name",
  # "data" and "size" such as Grit::Blob.
  module BlobHelper
    # Public: Get the extname of the path
    #
    # Examples
    #
    #   blob(name='foo.rb').extname
    #   # => '.rb'
    #
    # Returns a String
    def extname
      File.extname(name.to_s)
    end

    # Internal: Lookup mime type for extension.
    #
    # Returns a MIME::Type
    def _mime_type
      if defined? @_mime_type
        @_mime_type
      else
        guesses = ::MIME::Types.type_for(extname.to_s)

        # Prefer text mime types over binary
        @_mime_type = guesses.detect { |type| type.ascii? } ||
          # Otherwise use the first guess
          guesses.first
      end
    end

    # Public: Get the actual blob mime type
    #
    # Examples
    #
    #   # => 'text/plain'
    #   # => 'text/html'
    #
    # Returns a mime type String.
    def mime_type
      _mime_type ? _mime_type.to_s : 'text/plain'
    end

    # Internal: Is the blob binary according to its mime type
    #
    # Return true or false
    def binary_mime_type?
      _mime_type ? _mime_type.binary? : false
    end

    # Internal: Is the blob binary according to its mime type,
    # overriding it if we have better data from the languages.yml
    # database.
    #
    # Return true or false
    def likely_binary?
      binary_mime_type? && !Language.find_by_filename(name)
    end

    # Public: Get the Content-Type header value
    #
    # This value is used when serving raw blobs.
    #
    # Examples
    #
    #   # => 'text/plain; charset=utf-8'
    #   # => 'application/octet-stream'
    #
    # Returns a content type String.
    def content_type
      @content_type ||= (binary_mime_type? || binary?) ? mime_type :
        (encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")
    end

    # Public: Get the Content-Disposition header value
    #
    # This value is used when serving raw blobs.
    #
    #   # => "attachment; filename=file.tar"
    #   # => "inline"
    #
    # Returns a content disposition String.
    def disposition
      if text? || image?
        'inline'
      elsif name.nil?
        "attachment"
      else
        "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
      end
    end

    # Public: Get the detected character encoding name.
    #
    # Returns a String (e.g. "UTF-8") or nil when detection failed.
    def encoding
      if hash = detect_encoding
        hash[:encoding]
      end
    end

    # Try to guess the encoding
    #
    # Returns: a Hash, with :encoding, :confidence, :type
    # this will return nil if an error occurred during detection or
    # no valid encoding could be found
    def detect_encoding
      # FIX: use a `defined?` guard instead of `||=` so that a nil result
      # (Charlock could not detect an encoding) is memoized too; with `||=`
      # the detector would be re-run on every call for such blobs.
      return @detect_encoding if defined? @detect_encoding
      @detect_encoding = CharlockHolmes::EncodingDetector.new.detect(data) if data
    end

    # Public: Is the blob binary?
    #
    # Return true or false
    def binary?
      # Large blobs aren't even loaded into memory
      if data.nil?
        true

      # Treat blank files as text
      elsif data == ""
        false

      # Charlock doesn't know what to think
      elsif encoding.nil?
        true

      # If Charlock says its binary
      else
        detect_encoding[:type] == :binary
      end
    end

    # Public: Is the blob text?
    #
    # Return true or false
    def text?
      !binary?
    end

    # Public: Is the blob a supported image format?
    #
    # Return true or false
    def image?
      ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
    end

    # Public: Is the blob a supported 3D model format?
    #
    # Return true or false
    def solid?
      extname.downcase == '.stl'
    end

    # Public: Is this blob a CSV file?
    #
    # Return true or false
    def csv?
      text? && extname.downcase == '.csv'
    end

    # Public: Is the blob a PDF?
    #
    # Return true or false
    def pdf?
      extname.downcase == '.pdf'
    end

    MEGABYTE = 1024 * 1024

    # Public: Is the blob too big to load?
    #
    # Return true or false
    def large?
      size.to_i > MEGABYTE
    end

    # Public: Is the blob safe to colorize?
    #
    # We use Pygments for syntax highlighting blobs. Pygments
    # can be too slow for very large blobs or for certain
    # corner-case blobs.
    #
    # Return true or false
    def safe_to_colorize?
      !large? && text? && !high_ratio_of_long_lines?
    end

    # Internal: Does the blob have a ratio of long lines?
    #
    # These types of files are usually going to make Pygments.rb
    # angry if we try to colorize them.
    #
    # Return true or false
    def high_ratio_of_long_lines?
      return false if loc == 0
      # Integer division: average bytes per line above 5000 flags the blob.
      size / loc > 5000
    end

    # Public: Is the blob viewable?
    #
    # Non-viewable blobs will just show a "View Raw" link
    #
    # Return true or false
    def viewable?
      !large? && text?
    end

    # Load vendored-path patterns once at module definition time and
    # compile them into a single alternation regexp.
    vendored_paths = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
    VendoredRegexp = Regexp.new(vendored_paths.join('|'))

    # Public: Is the blob in a vendored directory?
    #
    # Vendored files are ignored by language statistics.
    #
    # See "vendor.yml" for a list of vendored conventions that match
    # this pattern.
    #
    # Return true or false
    def vendored?
      name =~ VendoredRegexp ? true : false
    end

    # Public: Get each line of data
    #
    # Requires Blob#data
    #
    # Returns an Array of lines
    def lines
      @lines ||=
        if viewable? && data
          # Limit of -1 keeps trailing empty fields, so a trailing
          # newline yields a final empty line.
          data.split(/\r\n|\r|\n/, -1)
        else
          []
        end
    end

    # Public: Get number of lines of code
    #
    # Requires Blob#data
    #
    # Returns Integer
    def loc
      lines.size
    end

    # Public: Get number of source lines of code
    #
    # Requires Blob#data
    #
    # Returns Integer
    def sloc
      lines.grep(/\S/).size
    end

    # Public: Is the blob a generated file?
    #
    # Generated source code is suppressed in diffs and is ignored by
    # language statistics.
    #
    # May load Blob#data
    #
    # Return true or false
    def generated?
      # FIX: the previous `@_generated ||= ...` never memoized a `false`
      # answer, so the (potentially data-loading) Generated check re-ran on
      # every call for non-generated blobs. Guard with `defined?` instead.
      return @_generated if defined? @_generated
      @_generated = Generated.generated?(name, lambda { data })
    end

    # Public: Detects the Language of the blob.
    #
    # May load Blob#data
    #
    # Returns a Language or nil if none is detected
    def language
      return @language if defined? @language

      if defined?(@data) && @data.is_a?(String)
        data = @data
      else
        # Defer loading data until detection actually needs it; binary
        # blobs are passed as empty strings.
        data = lambda { (binary_mime_type? || binary?) ? "" : self.data }
      end

      # NOTE(review): `mode` is not defined in this mixin — it is expected
      # to be provided by the including Blob class; confirm at include site.
      @language = Language.detect(name.to_s, data, mode)
    end

    # Internal: Get the lexer of the blob.
    #
    # Returns a Lexer.
    def lexer
      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
    end

    # Public: Highlight syntax of blob
    #
    # options - A Hash of options (defaults to {})
    #
    # Returns html String
    def colorize(options = {})
      return unless safe_to_colorize?
      options[:options] ||= {}
      options[:options][:encoding] ||= encoding
      lexer.highlight(data, options)
    end
  end
end
|
data/lib/linguist/classifier.rb
ADDED
@@ -0,0 +1,171 @@
|
|
1
|
+
require 'linguist/tokenizer'
|
2
|
+
|
3
|
+
module Linguist
  # Language bayesian classifier.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object
    # language - String language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   Classifier.train(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    #
    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token,
    # per-language. See also dump_all_tokens, below.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      db['tokens_total'] ||= 0
      db['languages_total'] ||= 0
      db['tokens'] ||= {}
      db['language_tokens'] ||= {}
      db['languages'] ||= {}

      tokens.each do |token|
        db['tokens'][language] ||= {}
        db['tokens'][language][token] ||= 0
        db['tokens'][language][token] += 1
        db['language_tokens'][language] ||= 0
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1
      end
      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to
    #             (defaults to every language in the db).
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [ 'Ruby', 0.90], ['Python', 0.2], ... ]
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def self.classify(db, tokens, languages = nil)
      languages ||= db['languages'].keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens']
      @language_tokens = db['language_tokens']
      @languages       = db['languages']
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      scores = {}
      if verbosity >= 2
        dump_all_tokens(tokens, languages)
      end
      languages.each do |language|
        scores[language] = tokens_probability(tokens, language) +
                           language_probability(language)
        if verbosity >= 1
          printf "%10s = %10.3f + %7.3f = %10.3f\n",
            language, tokens_probability(tokens, language), language_probability(language), scores[language]
        end
      end

      # Highest score first.
      scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
    end

    # Internal: Log-probability of a set of tokens occurring in a
    # language - log P(D | C)
    #
    # tokens   - Array of String tokens.
    # language - Language to check.
    #
    # Returns a Float log-probability (<= 0.0); NOT a value in [0, 1].
    def tokens_probability(tokens, language)
      tokens.inject(0.0) do |sum, token|
        sum + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of token in language occurring - P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # Returns Float between 0.0 and 1.0.
    def token_probability(token, language)
      # FIX: guard against a language that has no entry in the token db
      # (e.g. caller restricts to an untrained language); previously this
      # raised NoMethodError on nil. Unseen tokens get a small floor
      # probability instead of zero.
      count = (@tokens[language] || {})[token].to_f
      if count == 0.0
        1 / @tokens_total.to_f
      else
        count / @language_tokens[language].to_f
      end
    end

    # Internal: Log-probability of a language occurring - log P(C)
    #
    # language - Language to check.
    #
    # Returns a Float log-probability (<= 0.0); NOT a value in [0, 1].
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end

    private
    def verbosity
      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
    end

    # Internal: show a table of probabilities for each <token,language> pair.
    #
    # The number in each table entry is the number of "points" that each
    # token contributes toward the belief that the file under test is a
    # particular language.  Points are additive.
    #
    # Points are the number of times a token appears in the file, times
    # how much more likely (log of probability ratio) that token is to
    # appear in one language vs.  the least-likely language.  Dashes
    # indicate the least-likely language (and zero points) for each token.
    def dump_all_tokens(tokens, languages)
      maxlen = tokens.map { |tok| tok.size }.max

      printf "%#{maxlen}s", ""
      puts "    #" + languages.map { |lang| sprintf("%10s", lang) }.join

      tokmap = Hash.new(0)
      tokens.each { |tok| tokmap[tok] += 1 }

      tokmap.sort.each { |tok, count|
        arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
        min = arr.map { |a, b| b }.min
        minlog = Math.log(min)
        if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] }
          printf "%#{maxlen}s%5d", tok, count

          puts arr.map { |ent|
            ent[1] == min ? "         -" : sprintf("%10.3f", count * (Math.log(ent[1]) - minlog))
          }.join
        end
      }
    end
  end
end
|