ol-github-linguist 2.4.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/linguist +42 -0
- data/lib/linguist.rb +5 -0
- data/lib/linguist/blob_helper.rb +352 -0
- data/lib/linguist/classifier.rb +123 -0
- data/lib/linguist/file_blob.rb +56 -0
- data/lib/linguist/generated.rb +162 -0
- data/lib/linguist/language.rb +483 -0
- data/lib/linguist/languages.yml +1302 -0
- data/lib/linguist/md5.rb +38 -0
- data/lib/linguist/popular.yml +29 -0
- data/lib/linguist/repository.rb +95 -0
- data/lib/linguist/samples.json +31082 -0
- data/lib/linguist/samples.rb +98 -0
- data/lib/linguist/tokenizer.rb +197 -0
- data/lib/linguist/vendor.yml +98 -0
- metadata +129 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a9c207f6a1dfc4eae287a1e95dc9dd6277d5de62
|
4
|
+
data.tar.gz: a12e690d63c1ed2024d0936d2fc129c0ef5634c2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 6dbde1f11826f69c97d06ee6e2ad2412943302268cbba8c384c425ede0d9e58fcb600b818eecaa02ecd057e8b9c2f51d897bb6a9ad095bc73dc27eb9a1a4e74c
|
7
|
+
data.tar.gz: 8096db1ad76618186ef2b435f5242abb918e3a64222edc6a135a35f1e58650d7c9b71d74f5c0bb98048d8bdb463e4865e6473b335f5e51a64a5217d7aaebd693
|
data/bin/linguist
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'linguist/file_blob'
|
4
|
+
require 'linguist/repository'
|
5
|
+
|
6
|
+
# Path to inspect: the first command-line argument, or the current
# working directory when none is given.
path = ARGV[0] || Dir.pwd

if File.directory?(path)
  # Directory: print every language with its rounded percentage of the
  # repository size, largest first.
  repo = Linguist::Repository.from_directory(path)
  ranked = repo.languages.sort_by { |_, size| size }.reverse
  ranked.each do |language, size|
    percentage = (size / repo.size.to_f * 100).round
    puts format("%-4s %s", "#{percentage}%", language)
  end
elsif File.file?(path)
  # Single file: print a short report about the blob.
  blob = Linguist::FileBlob.new(path, Dir.pwd)

  type = case
         when blob.text?  then 'Text'
         when blob.image? then 'Image'
         else 'Binary'
         end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts " type: #{type}"
  puts " mime type: #{blob.mime_type}"
  puts " language: #{blob.language}"

  puts " blob is too large to be shown" if blob.large?
  puts " appears to be generated source code" if blob.generated?
  puts " appears to be a vendored file" if blob.vendored?
else
  # Neither a directory nor a regular file.
  abort "usage: linguist <path>"
end
|
data/lib/linguist.rb
ADDED
@@ -0,0 +1,352 @@
|
|
1
|
+
require 'linguist/generated'
require 'linguist/language'

#require 'charlock_holmes'
#require 'escape_utils'
require 'cgi'
require 'mime/types'
#require 'pygments'
require 'yaml'
|
9
|
+
|
10
|
+
module Linguist
|
11
|
+
# BlobHelper is a mixin for Blobish classes that respond to "name",
|
12
|
+
# "data" and "size" such as Grit::Blob.
|
13
|
+
module BlobHelper
|
14
|
+
# Public: Get the extname of the path
|
15
|
+
#
|
16
|
+
# Examples
|
17
|
+
#
|
18
|
+
# blob(name='foo.rb').extname
|
19
|
+
# # => '.rb'
|
20
|
+
#
|
21
|
+
# Returns a String
|
22
|
+
def extname
  # name may be nil; to_s turns that into '' so the result is always a String.
  path = name.to_s
  File.extname(path)
end
|
25
|
+
|
26
|
+
# Internal: Lookup mime type for extension.
|
27
|
+
#
|
28
|
+
# Returns a MIME::Type
|
29
|
+
def _mime_type
  # Memoized with defined? so that a nil result is cached as well.
  return @_mime_type if defined? @_mime_type

  guesses = ::MIME::Types.type_for(extname.to_s)

  # Prefer a text (ascii) mime type over a binary one; otherwise use the
  # first guess, which is nil when there are no guesses at all.
  @_mime_type = guesses.find { |type| type.ascii? } || guesses.first
end
|
41
|
+
|
42
|
+
# Public: Get the actual blob mime type
|
43
|
+
#
|
44
|
+
# Examples
|
45
|
+
#
|
46
|
+
# # => 'text/plain'
|
47
|
+
# # => 'text/html'
|
48
|
+
#
|
49
|
+
# Returns a mime type String.
|
50
|
+
def mime_type
  type = _mime_type
  # Fall back to plain text when no mime type was found for the extension.
  return 'text/plain' unless type

  type.to_s
end
|
53
|
+
|
54
|
+
# Internal: Is the blob binary according to its mime type
|
55
|
+
#
|
56
|
+
# Return true or false
|
57
|
+
def binary_mime_type?
  type = _mime_type
  # Without a known mime type the blob is not considered binary here.
  return false unless type

  type.binary?
end
|
60
|
+
|
61
|
+
# Public: Get the Content-Type header value
|
62
|
+
#
|
63
|
+
# This value is used when serving raw blobs.
|
64
|
+
#
|
65
|
+
# Examples
|
66
|
+
#
|
67
|
+
# # => 'text/plain; charset=utf-8'
|
68
|
+
# # => 'application/octet-stream'
|
69
|
+
#
|
70
|
+
# Returns a content type String.
|
71
|
+
def content_type
  @content_type ||=
    if binary_mime_type? || binary?
      # Binary blobs are served with their own mime type.
      mime_type
    elsif (charset = encoding)
      # Text with a known encoding advertises it, lowercased.
      "text/plain; charset=#{charset.downcase}"
    else
      "text/plain"
    end
end
|
75
|
+
|
76
|
+
# Public: Get the Content-Disposition header value
|
77
|
+
#
|
78
|
+
# This value is used when serving raw blobs.
|
79
|
+
#
|
80
|
+
# # => "attachment; filename=file.tar"
|
81
|
+
# # => "inline"
|
82
|
+
#
|
83
|
+
# Returns a content disposition String.
|
84
|
+
def disposition
  # Text and image blobs are served inline.
  return 'inline' if text? || image?

  # Without a name there is nothing to put in the filename parameter.
  return "attachment" if name.nil?

  # CGI.escape lives in the stdlib 'cgi' library, which must be required
  # for this constant to resolve. Only the basename is exposed, never the
  # directory part of the path.
  "attachment; filename=#{CGI.escape(File.basename(name))}"
end
|
94
|
+
|
95
|
+
def encoding
  # Returns the :encoding entry of the detection result, or nil when
  # detection produced nothing.
  detected = detect_encoding
  detected ? detected[:encoding] : nil
end
|
100
|
+
|
101
|
+
# Try to guess the encoding
|
102
|
+
#
|
103
|
+
# Returns: a Hash, with :encoding, :confidence, :type
|
104
|
+
# this will return nil if an error occurred during detection or
|
105
|
+
# no valid encoding could be found
|
106
|
+
def detect_encoding
  # The CharlockHolmes detector is not loaded in this build. Always
  # returning nil would make #binary? report true for every non-empty
  # blob, so a small stdlib-only heuristic is used instead:
  #
  #   * nil or empty data            -> nil (nothing to inspect)
  #   * data containing a NUL byte   -> { :type => :binary }
  #   * data that is valid UTF-8     -> { :type => :text, :encoding => 'UTF-8' }
  #   * anything else                -> nil (no valid encoding found)
  #
  # :confidence is a nominal 100, since the checks are yes/no tests rather
  # than statistical guesses. The result is memoized with defined? so that
  # a nil outcome is cached too.
  return @detect_encoding if defined? @detect_encoding

  raw = data
  @detect_encoding =
    if raw.nil? || raw.empty?
      nil
    elsif raw.dup.force_encoding(Encoding::BINARY).include?("\0")
      # No :encoding key, so #encoding is nil and #binary? stays true.
      { :type => :binary, :confidence => 100 }
    elsif raw.dup.force_encoding(Encoding::UTF_8).valid_encoding?
      { :type => :text, :encoding => 'UTF-8', :confidence => 100 }
    end
end
|
109
|
+
|
110
|
+
# Public: Is the blob binary?
|
111
|
+
#
|
112
|
+
# Return true or false
|
113
|
+
def binary?
  content = data

  # Large blobs aren't even loaded into memory
  return true if content.nil?

  # Treat blank files as text
  return false if content == ""

  # No encoding could be detected for the data
  return true if encoding.nil?

  # Otherwise go by the type reported by the encoding detection
  detect_encoding[:type] == :binary
end
|
131
|
+
|
132
|
+
# Public: Is the blob text?
|
133
|
+
#
|
134
|
+
# Return true or false
|
135
|
+
def text?
  # Text is defined purely as the complement of #binary?.
  binary? ? false : true
end
|
138
|
+
|
139
|
+
# Public: Is the blob a supported image format?
|
140
|
+
#
|
141
|
+
# Return true or false
|
142
|
+
def image?
  # Compare case-insensitively so that names such as "PHOTO.JPG" count as
  # images too.
  ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
end
|
145
|
+
|
146
|
+
# Public: Is the blob a supported 3D model format?
|
147
|
+
#
|
148
|
+
# Return true or false
|
149
|
+
def solid?
  # Compare case-insensitively so that names such as "MODEL.STL" are
  # recognised too.
  ['.stl', '.obj'].include?(extname.downcase)
end
|
152
|
+
|
153
|
+
# Number of bytes in one megabyte; the threshold used by #large?.
MEGABYTE = 1024 * 1024

# Public: Is the blob too big to load?
#
# A nil size is coerced to 0 by to_i and therefore never counts as large.
#
# Return true or false
def large?
  byte_count = size.to_i
  byte_count > MEGABYTE
end
|
161
|
+
|
162
|
+
# Public: Is the blob safe to colorize?
|
163
|
+
#
|
164
|
+
# We use Pygments.rb for syntax highlighting blobs, which
|
165
|
+
# has some quirks and also is essentially 'un-killable' via
|
166
|
+
# normal timeout. To work around this we try to
# carefully avoid handing Pygments.rb anything it can't handle.
|
168
|
+
#
|
169
|
+
# Return true or false
|
170
|
+
def safe_to_colorize?
  # Large blobs are never highlighted.
  return false if large?

  # Only text whose lines are not extremely long is considered safe.
  text? && !high_ratio_of_long_lines?
end
|
173
|
+
|
174
|
+
# Internal: Does the blob have a ratio of long lines?
|
175
|
+
#
|
176
|
+
# These types of files are usually going to make Pygments.rb
|
177
|
+
# angry if we try to colorize them.
|
178
|
+
#
|
179
|
+
# Return true or false
|
180
|
+
def high_ratio_of_long_lines?
  line_count = loc
  # No lines means no ratio to compute (and avoids dividing by zero).
  if line_count == 0
    false
  else
    # Average bytes per line, compared against a 5000-byte limit.
    size / line_count > 5000
  end
end
|
184
|
+
|
185
|
+
# Public: Is the blob viewable?
|
186
|
+
#
|
187
|
+
# Non-viewable blobs will just show a "View Raw" link
|
188
|
+
#
|
189
|
+
# Return true or false
|
190
|
+
def viewable?
  # A blob is viewable only when it is text and not large.
  large? ? false : text?
end
|
193
|
+
|
194
|
+
# Single regexp that alternates every pattern listed in vendor.yml, which
# is loaded from the directory containing this file.
vendor_file = File.expand_path("../vendor.yml", __FILE__)
VendoredRegexp = Regexp.new(YAML.load_file(vendor_file).join('|'))
|
196
|
+
|
197
|
+
# Public: Is the blob in a vendored directory?
|
198
|
+
#
|
199
|
+
# Vendored files are ignored by language statistics.
|
200
|
+
#
|
201
|
+
# See "vendor.yml" for a list of vendored conventions that match
|
202
|
+
# this pattern.
|
203
|
+
#
|
204
|
+
# Return true or false
|
205
|
+
def vendored?
  # =~ yields a match index or nil; coerce that to a strict boolean.
  !!(name =~ VendoredRegexp)
end
|
208
|
+
|
209
|
+
# Public: Get each line of data
|
210
|
+
#
|
211
|
+
# Requires Blob#data
|
212
|
+
#
|
213
|
+
# Returns an Array of lines
|
214
|
+
def lines
  @lines ||= begin
    # Data is only consulted for viewable blobs.
    content = viewable? ? data : nil
    # The -1 limit keeps trailing empty strings, so a final line terminator
    # yields a last empty entry.
    content ? content.split(line_split_character, -1) : []
  end
end
|
222
|
+
|
223
|
+
# Character used to split lines. This is almost always "\n" except when Mac
|
224
|
+
# Format is detected in which case it's "\r".
|
225
|
+
#
|
226
|
+
# Returns a split pattern string.
|
227
|
+
def line_split_character
  # "\r" for Mac-format data, "\n" for everything else; memoized.
  @line_split_character ||=
    if mac_format?
      "\r"
    else
      "\n"
    end
end
|
230
|
+
|
231
|
+
# Public: Is the data in ** Mac Format **. This format uses \r (0x0d) characters
|
232
|
+
# for line ends and does not include a \n (0x0a).
|
233
|
+
#
|
234
|
+
# Returns true when mac format is detected.
|
235
|
+
def mac_format?
  # Non-viewable blobs are never inspected; the result is nil for them.
  return unless viewable?

  # Only the first 4096 characters are searched for a carriage return.
  cr_at = data[0, 4096].index("\r")
  return unless cr_at

  # Mac format when that first "\r" is not immediately followed by "\n".
  data[cr_at + 1] != ?\n
end
|
241
|
+
|
242
|
+
# Public: Get number of lines of code
|
243
|
+
#
|
244
|
+
# Requires Blob#data
|
245
|
+
#
|
246
|
+
# Returns Integer
|
247
|
+
def loc
  # Total number of lines, blank ones included.
  lines.length
end
|
250
|
+
|
251
|
+
# Public: Get number of source lines of code
|
252
|
+
#
|
253
|
+
# Requires Blob#data
|
254
|
+
#
|
255
|
+
# Returns Integer
|
256
|
+
def sloc
  # Count only lines containing at least one non-whitespace character.
  lines.count { |line| line =~ /\S/ }
end
|
259
|
+
|
260
|
+
# Public: Is the blob a generated file?
|
261
|
+
#
|
262
|
+
# Generated source code is suppressed in diffs and is ignored by
|
263
|
+
# language statistics.
|
264
|
+
#
|
265
|
+
# May load Blob#data
|
266
|
+
#
|
267
|
+
# Return true or false
|
268
|
+
def generated?
  # Memoize with defined? rather than ||= so that a false result is cached
  # too; otherwise detection would run again on every call for blobs that
  # are not generated.
  return @_generated if defined? @_generated

  # The data is passed lazily so it is only loaded if the check needs it.
  @_generated = Generated.generated?(name, lambda { data })
end
|
271
|
+
|
272
|
+
# Public: Should the blob be indexed for searching?
|
273
|
+
#
|
274
|
+
# Excluded:
|
275
|
+
# - Files over 0.1MB
|
276
|
+
# - Non-text files
|
277
|
+
# - Languages marked as not searchable
|
278
|
+
# - Generated source files
|
279
|
+
#
|
280
|
+
# Please add additional test coverage to
|
281
|
+
# `test/test_blob.rb#test_indexable` if you make any changes.
|
282
|
+
#
|
283
|
+
# Return true or false
|
284
|
+
def indexable?
  # Checks run in order; the first one that applies decides the result.
  return false if size > 100 * 1024   # over 100 KiB
  return false if binary?
  return true if extname == '.txt'    # .txt files need no detected language

  detected = language
  return false if detected.nil?
  return false unless detected.searchable?

  !generated?
end
|
301
|
+
|
302
|
+
# Public: Detects the Language of the blob.
|
303
|
+
#
|
304
|
+
# May load Blob#data
|
305
|
+
#
|
306
|
+
# Returns a Language or nil if none is detected
|
307
|
+
def language
  # Memoized with defined? so that a nil detection result is cached too.
  return @language if defined? @language

  source =
    if defined?(@data) && @data.is_a?(String)
      # Data already held in memory as a String is passed directly.
      @data
    else
      # Otherwise pass a lambda so the data is only read on demand; binary
      # blobs yield an empty string instead of their contents.
      lambda { (binary_mime_type? || binary?) ? "" : self.data }
    end

  @language = Language.detect(name.to_s, source, mode)
end
|
318
|
+
|
319
|
+
# Internal: Get the lexer of the blob.
|
320
|
+
#
|
321
|
+
# Returns a Lexer.
|
322
|
+
def lexer
  detected = language
  return detected.lexer if detected

  # No language detected: fall back to the plain-text lexer.
  # NOTE(review): the pygments require at the top of this file is commented
  # out, so Pygments must be loaded elsewhere for this branch to work.
  Pygments::Lexer.find_by_name('Text only')
end
|
325
|
+
|
326
|
+
# Public: Highlight syntax of blob
|
327
|
+
#
|
328
|
+
# options - A Hash of options (defaults to {})
|
329
|
+
#
|
330
|
+
# Returns html String
|
331
|
+
def colorize(options = {})
  # Returns nil for blobs that are not safe to highlight.
  return unless safe_to_colorize?

  # Work on copies so the caller's hash (and its nested :options hash) is
  # never mutated; this also makes frozen option hashes usable.
  lexer_options = (options[:options] || {}).dup
  # A caller-supplied encoding wins; otherwise use the detected one.
  lexer_options[:encoding] ||= encoding

  lexer.highlight(data, options.merge(:options => lexer_options))
end
|
337
|
+
|
338
|
+
# Public: Highlight syntax of blob without the outer highlight div
|
339
|
+
# wrapper.
|
340
|
+
#
|
341
|
+
# options - A Hash of options (defaults to {})
|
342
|
+
#
|
343
|
+
# Returns html String
|
344
|
+
def colorize_without_wrapper(options = {})
  html = colorize(options)
  # Nothing was highlighted: return an empty string.
  return '' unless html

  # Keep only what sits inside <div class="highlight"><pre>...</pre></div>.
  # The result is nil when the markup does not have that shape.
  html[%r{<div class="highlight"><pre>(.*?)</pre>\s*</div>}m, 1]
end
|
351
|
+
end
|
352
|
+
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'linguist/tokenizer'
|
2
|
+
|
3
|
+
module Linguist
|
4
|
+
# Language bayesian classifier.
|
5
|
+
class Classifier
  # Public: Train classifier that data is a certain language.
  #
  # db       - Hash classifier database object; updated in place.
  # language - String language of data
  # data     - String contents of file
  #
  # Examples
  #
  #   Classifier.train!(db, 'Ruby', "def hello; end")
  #
  # Returns nothing.
  def self.train!(db, language, data)
    tokens = Tokenizer.tokenize(data)

    db['tokens_total'] ||= 0
    db['languages_total'] ||= 0
    db['tokens'] ||= {}
    db['language_tokens'] ||= {}
    db['languages'] ||= {}

    tokens.each do |token|
      db['tokens'][language] ||= {}
      db['tokens'][language][token] ||= 0
      db['tokens'][language][token] += 1
      db['language_tokens'][language] ||= 0
      db['language_tokens'][language] += 1
      db['tokens_total'] += 1
    end
    # The sample is counted even when it produced no tokens, so a language
    # can be listed in db['languages'] without an entry in db['tokens'].
    db['languages'][language] ||= 0
    db['languages'][language] += 1
    db['languages_total'] += 1

    nil
  end

  # Public: Guess language of data.
  #
  # db        - Hash of classifier tokens database.
  # tokens    - Array of tokens or String data to analyze.
  # languages - Array of language name Strings to restrict to
  #             (defaults to every language in db).
  #
  # Examples
  #
  #   Classifier.classify(db, "def hello; end")
  #   # => [['Ruby', -3.3], ['Python', -4.3], ...]  (illustrative scores)
  #
  # Returns sorted Array of result pairs, highest score first. Each pair
  # contains the String language name and a Float score.
  def self.classify(db, tokens, languages = nil)
    languages ||= db['languages'].keys
    new(db).classify(tokens, languages)
  end

  # Internal: Initialize a Classifier.
  #
  # db - Hash classifier database, as built by Classifier.train!.
  def initialize(db = {})
    @tokens_total    = db['tokens_total']
    @languages_total = db['languages_total']
    @tokens          = db['tokens']
    @language_tokens = db['language_tokens']
    @languages       = db['languages']
  end

  # Internal: Guess language of data
  #
  # tokens    - Array of tokens or String data to analyze.
  # languages - Array of language name Strings to restrict to.
  #
  # Returns sorted Array of result pairs, highest score first. Each pair
  # contains the String language name and a Float score. Returns an empty
  # Array when tokens is nil.
  def classify(tokens, languages)
    return [] if tokens.nil?
    tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

    scores = {}
    languages.each do |language|
      scores[language] = tokens_probability(tokens, language) +
                         language_probability(language)
    end

    # Hash#sort yields [language, score] pairs; order them by descending score.
    scores.sort { |a, b| b[1] <=> a[1] }
  end

  # Internal: Log-probability of a set of tokens occurring in a language -
  # log P(D | C)
  #
  # tokens   - Array of String tokens.
  # language - Language to check.
  #
  # Returns the Float sum of the natural logs of each token's probability.
  def tokens_probability(tokens, language)
    tokens.inject(0.0) do |sum, token|
      sum + Math.log(token_probability(token, language))
    end
  end

  # Internal: Probability of token occurring in language - P(F | C)
  #
  # token    - String token.
  # language - Language to check.
  #
  # Returns a Float: the token's count in the language divided by the
  # language's total token count, or 1 / (total tokens seen) when the
  # language has never produced that token.
  def token_probability(token, language)
    # A language may have no token table at all (see train!); treat that
    # the same as an unseen token instead of raising on nil.
    counts = @tokens[language]
    count  = counts ? counts[token].to_f : 0.0

    if count == 0.0
      1 / @tokens_total.to_f
    else
      count / @language_tokens[language].to_f
    end
  end

  # Internal: Log-probability of a language occurring - log P(C)
  #
  # language - Language to check.
  #
  # Returns the Float natural log of the language's share of all trained
  # samples.
  def language_probability(language)
    Math.log(@languages[language].to_f / @languages_total.to_f)
  end
end
|
123
|
+
end
|