geothird-linguist 2.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/linguist +42 -0
- data/lib/linguist.rb +5 -0
- data/lib/linguist/blob_helper.rb +360 -0
- data/lib/linguist/classifier.rb +123 -0
- data/lib/linguist/file_blob.rb +56 -0
- data/lib/linguist/generated.rb +175 -0
- data/lib/linguist/language.rb +481 -0
- data/lib/linguist/languages.yml +1403 -0
- data/lib/linguist/md5.rb +38 -0
- data/lib/linguist/popular.yml +29 -0
- data/lib/linguist/repository.rb +95 -0
- data/lib/linguist/samples.json +32050 -0
- data/lib/linguist/samples.rb +98 -0
- data/lib/linguist/tokenizer.rb +197 -0
- data/lib/linguist/vendor.yml +106 -0
- metadata +170 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 29e608de9f7d1f047fedc42252372a33c8f9af97
|
4
|
+
data.tar.gz: 184de1c9648189df496844f1b83299be88597ceb
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 28e2e56c28062cbb43bd9b54bc522512caf479459a065db81fccbb14d11a6e74060bb695b85b5b04b05325863b6388187581e7fe44f2b8573d6d0faa90f6c8ba
|
7
|
+
data.tar.gz: f561dd836463b6ea186fdc98ace219e0d8901a209875091ac6279e23375c0ab86346a96458b08aa462ba021675ed0310e685ecb6293a57b2ab48e9aca7c0d90f
|
data/bin/linguist
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'linguist/file_blob'
|
4
|
+
require 'linguist/repository'
|
5
|
+
|
6
|
+
# Command-line entry point.
#
# Takes one optional argument: a directory or file path (defaults to the
# current working directory).
#
#   * directory - prints each language with its rounded share of the
#                 repository size, largest first.
#   * file      - prints line counts, type, mime type, language and any
#                 large/generated/vendored flags for that blob.
#   * otherwise - aborts with a usage message.
target = ARGV[0] || Dir.pwd

if File.directory?(target)
  repository = Linguist::Repository.from_directory(target)
  by_size = repository.languages.sort_by { |_, bytes| bytes }

  by_size.reverse_each do |language, bytes|
    share = ((bytes / repository.size.to_f) * 100).round
    puts format("%-4s %s", "#{share}%", language)
  end
elsif File.file?(target)
  blob = Linguist::FileBlob.new(target, Dir.pwd)

  # Text wins over image, anything else is reported as binary.
  kind =
    case
    when blob.text?  then 'Text'
    when blob.image? then 'Image'
    else 'Binary'
    end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts " type: #{kind}"
  puts " mime type: #{blob.mime_type}"
  puts " language: #{blob.language}"

  puts " blob is too large to be shown" if blob.large?
  puts " appears to be generated source code" if blob.generated?
  puts " appears to be a vendored file" if blob.vendored?
else
  abort "usage: linguist <path>"
end
|
data/lib/linguist.rb
ADDED
@@ -0,0 +1,360 @@
|
|
1
|
+
require 'linguist/generated'
|
2
|
+
require 'linguist/language'
|
3
|
+
|
4
|
+
require 'charlock_holmes'
|
5
|
+
require 'escape_utils'
|
6
|
+
require 'mime/types'
|
7
|
+
require 'pygments'
|
8
|
+
require 'yaml'
|
9
|
+
|
10
|
+
module Linguist
|
11
|
+
# BlobHelper is a mixin for Blobish classes that respond to "name",
|
12
|
+
# "data" and "size" such as Grit::Blob.
|
13
|
+
module BlobHelper
|
14
|
+
# Public: Get the file extension of the blob's name.
#
# Examples
#
#   blob(name='foo.rb').extname
#   # => '.rb'
#
# Returns a String (empty when the name has no extension or is nil).
def extname
  path = name.to_s
  File.extname(path)
end
|
25
|
+
|
26
|
+
# Internal: Look up the mime type for the blob's extension.
#
# The result (including nil) is memoized, so MIME::Types is only
# consulted on the first call.
#
# Returns a MIME::Type, or nil when the extension is unknown.
def _mime_type
  return @_mime_type if defined? @_mime_type

  candidates = ::MIME::Types.type_for(extname.to_s)

  # Prefer a text mime type over a binary one; otherwise take the
  # first guess.
  textual = candidates.detect { |candidate| candidate.ascii? }
  @_mime_type = textual || candidates.first
end
|
41
|
+
|
42
|
+
# Public: Get the blob's mime type as a String.
#
# Examples
#
#   # => 'text/plain'
#   # => 'text/html'
#
# Returns a mime type String; 'text/plain' when no type could be guessed.
def mime_type
  guess = _mime_type
  return 'text/plain' unless guess

  guess.to_s
end
|
53
|
+
|
54
|
+
# Internal: Is the blob binary according to its guessed mime type?
#
# Return true or false (false when no mime type could be guessed).
def binary_mime_type?
  guess = _mime_type
  return false unless guess

  guess.binary?
end
|
60
|
+
|
61
|
+
# Internal: Is the blob binary according to its mime type, unless the
# languages.yml database knows a language for this filename (in which
# case that knowledge overrides the mime type guess)?
#
# Return true or false
def likely_binary?
  binary_mime_type? && !Language.find_by_filename(name)
end
|
69
|
+
|
70
|
+
# Public: Get the Content-Type header value
#
# This value is used when serving raw blobs. Binary blobs are served
# with their mime type; everything else is served as text/plain, with
# the detected charset appended when one is known. The value is memoized.
#
# Examples
#
#   # => 'text/plain; charset=utf-8'
#   # => 'application/octet-stream'
#
# Returns a content type String.
def content_type
  @content_type ||=
    if binary_mime_type? || binary?
      mime_type
    elsif encoding
      "text/plain; charset=#{encoding.downcase}"
    else
      "text/plain"
    end
end
|
84
|
+
|
85
|
+
# Public: Get the Content-Disposition header value
#
# This value is used when serving raw blobs. Text and image blobs are
# shown inline; everything else is offered as an attachment, carrying
# the URL-escaped basename when the blob has a name.
#
#   # => "attachment; filename=file.tar"
#   # => "inline"
#
# Returns a content disposition String.
def disposition
  return 'inline' if text? || image?
  return "attachment" if name.nil?

  "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
end
|
102
|
+
|
103
|
+
# Public: Get the detected encoding name of the blob.
#
# Returns the :encoding entry of the detect_encoding result, or nil when
# detection produced nothing.
def encoding
  detection = detect_encoding
  detection[:encoding] if detection
end
|
108
|
+
|
109
|
+
# Try to guess the encoding of the blob's data using CharlockHolmes.
#
# A truthy detection result is memoized.
#
# Returns: a Hash, with :encoding, :confidence, :type
#          this will return nil if there is no data, if an error
#          occurred during detection or if no valid encoding could be
#          found
def detect_encoding
  return unless data

  @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data)
end
|
117
|
+
|
118
|
+
# Public: Is the blob binary?
#
# Return true or false
def binary?
  # Large blobs aren't even loaded into memory
  return true if data.nil?

  # Treat blank files as text
  return false if data == ""

  # Charlock doesn't know what to think
  return true if encoding.nil?

  # Otherwise trust what Charlock says about the content type
  detect_encoding[:type] == :binary
end
|
139
|
+
|
140
|
+
# Public: Is the blob text? This is simply the negation of binary?.
#
# Return true or false
def text?
  binary? ? false : true
end
|
146
|
+
|
147
|
+
# Public: Is the blob a supported image format?
#
# The extension is compared case-insensitively, so 'PHOTO.PNG' counts
# as an image just like 'photo.png'.
#
# Return true or false
def image?
  ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
end
|
153
|
+
|
154
|
+
# Public: Is the blob a supported 3D model format?
#
# The extension is compared case-insensitively, so 'PART.STL' counts
# as a solid just like 'part.stl'.
#
# Return true or false
def solid?
  ['.stl', '.obj'].include?(extname.downcase)
end
|
160
|
+
|
161
|
+
# Number of bytes in one megabyte.
MEGABYTE = 1024 * 1024

# Public: Is the blob too big to load?
#
# A blob is large when its size is strictly greater than one megabyte.
# The size is coerced with to_i, so a nil size counts as 0.
#
# Return true or false
def large?
  byte_count = size.to_i
  byte_count > MEGABYTE
end
|
169
|
+
|
170
|
+
# Public: Is the blob safe to colorize?
#
# We use Pygments.rb for syntax highlighting blobs, which
# has some quirks and also is essentially 'un-killable' via
# normal timeout. To work around this we carefully avoid
# handing Pygments.rb anything it can't handle.
#
# Return true or false
def safe_to_colorize?
  return false if large?

  text? && !high_ratio_of_long_lines?
end
|
181
|
+
|
182
|
+
# Internal: Does the blob have a high ratio of long lines?
#
# These types of files are usually going to make Pygments.rb
# angry if we try to colorize them. The ratio is the blob size divided
# by its line count; anything above 5000 counts as high.
#
# Return true or false (false when the blob has no lines)
def high_ratio_of_long_lines?
  line_count = loc
  line_count == 0 ? false : (size / line_count) > 5000
end
|
192
|
+
|
193
|
+
# Public: Is the blob viewable?
#
# Non-viewable blobs will just show a "View Raw" link. A blob is
# viewable when it is not large and is text.
#
# Return true or false
def viewable?
  return false if large?

  text?
end
|
201
|
+
|
202
|
+
# Vendored path patterns are read from the vendor.yml file that sits
# next to this source file, then joined into a single alternation.
vendor_yml = File.expand_path("../vendor.yml", __FILE__)
vendored_paths = YAML.load_file(vendor_yml)
VendoredRegexp = Regexp.new(vendored_paths.join('|'))
|
204
|
+
|
205
|
+
# Public: Is the blob in a vendored directory?
#
# Vendored files are ignored by language statistics.
#
# See "vendor.yml" for a list of vendored conventions that match
# this pattern.
#
# Return true or false
def vendored?
  !!(name =~ VendoredRegexp)
end
|
216
|
+
|
217
|
+
# Public: Get each line of data
#
# Requires Blob#data. The data is split on line_split_character with
# trailing empty fields kept; the result is memoized.
#
# Returns an Array of lines (empty when the blob is not viewable or has
# no data)
def lines
  return @lines if @lines

  @lines = (viewable? && data) ? data.split(line_split_character, -1) : []
end
|
230
|
+
|
231
|
+
# Character used to split lines. This is almost always "\n" except when Mac
# Format is detected in which case it's "\r". The choice is memoized.
#
# Returns a split pattern string.
def line_split_character
  @line_split_character ||= if mac_format? then "\r" else "\n" end
end
|
238
|
+
|
239
|
+
# Public: Is the data in ** Mac Format **. This format uses \r (0x0d) characters
# for line ends and does not include a \n (0x0a).
#
# Only the first 4096 characters are searched for a \r; the data counts
# as Mac format when the first \r found is not followed by a \n.
#
# Returns true when mac format is detected, false when the first \r is
# followed by \n, and nil when the blob is not viewable or no \r is found.
def mac_format?
  return unless viewable?

  cr_index = data[0, 4096].index("\r")
  return unless cr_index

  data[cr_index + 1] != ?\n
end
|
249
|
+
|
250
|
+
# Public: Get number of lines of code
#
# Requires Blob#data
#
# Returns Integer count of all entries in lines, blank ones included
def loc
  lines.length
end
|
258
|
+
|
259
|
+
# Public: Get number of source lines of code
#
# Requires Blob#data. A source line is any line containing at least one
# non-whitespace character.
#
# Returns Integer
def sloc
  lines.count { |line| line =~ /\S/ }
end
|
267
|
+
|
268
|
+
# Public: Is the blob a generated file?
#
# Generated source code is suppressed in diffs and is ignored by
# language statistics.
#
# May load Blob#data (it is handed to Generated as a lambda so it is
# only read if the check needs it).
#
# The answer is memoized with `defined?` rather than `||=` so that a
# false result is cached too and the check is not repeated on every call.
#
# Return true or false
def generated?
  return @_generated if defined? @_generated

  @_generated = Generated.generated?(name, lambda { data })
end
|
279
|
+
|
280
|
+
# Public: Should the blob be indexed for searching?
#
# Excluded:
# - Files over 0.1MB
# - Non-text files
# - Languages marked as not searchable
# - Generated source files
#
# Plain '.txt' files are always indexed once they pass the size and
# binary checks, even without a detected language.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_indexable` if you make any changes.
#
# Return true or false
def indexable?
  return false if size > 100 * 1024
  return false if binary?
  return true  if extname == '.txt'
  return false if language.nil?
  return false unless language.searchable?

  !generated?
end
|
309
|
+
|
310
|
+
# Public: Detects the Language of the blob.
#
# May load Blob#data. When the data is already held as a String in
# @data it is passed straight to the detector; otherwise a lambda is
# passed that yields "" for binary blobs and the blob's data for
# everything else. The result (including nil) is memoized.
#
# Returns a Language or nil if none is detected
def language
  return @language if defined? @language

  source =
    if defined?(@data) && @data.is_a?(String)
      @data
    else
      lambda { (binary_mime_type? || binary?) ? "" : self.data }
    end

  @language = Language.detect(name.to_s, source, mode)
end
|
326
|
+
|
327
|
+
# Internal: Get the lexer of the blob.
#
# Uses the detected language's lexer, falling back to the Pygments
# 'Text only' lexer when no language was detected.
#
# Returns a Lexer.
def lexer
  detected = language
  return detected.lexer if detected

  Pygments::Lexer.find_by_name('Text only')
end
|
333
|
+
|
334
|
+
# Public: Highlight syntax of blob
#
# options - A Hash of options (defaults to {}). Its nested :options
#           Hash is forwarded to the lexer with :encoding defaulted to
#           the blob's detected encoding. Neither the Hash passed in
#           nor its nested :options Hash is modified.
#
# Returns html String, or nil when the blob is not safe to colorize
def colorize(options = {})
  return unless safe_to_colorize?

  # Work on copies so the caller's hashes are left untouched.
  lexer_options = (options[:options] || {}).dup
  lexer_options[:encoding] ||= encoding

  lexer.highlight(data, options.merge(:options => lexer_options))
end
|
345
|
+
|
346
|
+
# Public: Highlight syntax of blob without the outer highlight div
# wrapper.
#
# options - A Hash of options (defaults to {})
#
# Returns the html String found inside the
# <div class="highlight"><pre>...</pre></div> wrapper, or '' when the
# blob could not be colorized
def colorize_without_wrapper(options = {})
  html = colorize(options)
  return '' unless html

  html[%r{<div class="highlight"><pre>(.*?)</pre>\s*</div>}m, 1]
end
|
359
|
+
end
|
360
|
+
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'linguist/tokenizer'
|
2
|
+
|
3
|
+
module Linguist
  # Language bayesian classifier.
  #
  # Scores are computed in log space: each language's score is the sum of
  # the logs of its per-token probabilities plus the log of the
  # language's prior probability, so scores are comparable with each
  # other but are not probabilities themselves.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object. It is updated in place;
    #            missing top-level keys are created.
    # language - String language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   Classifier.train!(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      db['tokens_total'] ||= 0
      db['languages_total'] ||= 0
      db['tokens'] ||= {}
      db['language_tokens'] ||= {}
      db['languages'] ||= {}

      # Per-language tables are only created once a token is actually
      # seen, so a sample with no tokens leaves no entry in db['tokens'].
      tokens.each do |token|
        db['tokens'][language] ||= {}
        db['tokens'][language][token] ||= 0
        db['tokens'][language][token] += 1
        db['language_tokens'][language] ||= 0
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1
      end
      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to
    #             (defaults to every language present in db).
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [['Ruby', -2.1], ['Python', -4.5], ...]
    #
    # Returns sorted Array of result pairs, best match first. Each pair
    # contains the String language name and a Float log score. An empty
    # db with no explicit languages yields an empty Array.
    def self.classify(db, tokens, languages = nil)
      languages ||= (db['languages'] || {}).keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier.
    #
    # db - Hash classifier database as built by train! (defaults to {}).
    #      Missing keys are treated as empty tables / zero counts.
    def initialize(db = {})
      @tokens_total    = db['tokens_total'] || 0
      @languages_total = db['languages_total'] || 0
      @tokens          = db['tokens'] || {}
      @language_tokens = db['language_tokens'] || {}
      @languages       = db['languages'] || {}
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs, best match first. Each pair
    # contains the String language name and a Float log score. Returns an
    # empty Array when tokens is nil.
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      scores = {}
      languages.each do |language|
        scores[language] = tokens_probability(tokens, language) +
                           language_probability(language)
      end

      # Hash#sort already yields [language, score] pairs.
      scores.sort { |a, b| b[1] <=> a[1] }
    end

    # Internal: Log probability of set of tokens in a language
    # occurring - log P(D | C)
    #
    # tokens   - Array of String tokens.
    # language - Language to check.
    #
    # Returns the Float sum of the logs of each token's probability.
    def tokens_probability(tokens, language)
      tokens.inject(0.0) do |sum, token|
        sum + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of token in language occurring - P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # A token never seen for the language (including any token for a
    # language that has no token table at all) is given the small
    # fallback probability of 1 / total tokens seen.
    #
    # Returns Float.
    def token_probability(token, language)
      seen = (@tokens[language] || {})[token].to_f
      if seen == 0.0
        1 / @tokens_total.to_f
      else
        seen / @language_tokens[language].to_f
      end
    end

    # Internal: Log probability of a language occurring - log P(C)
    #
    # language - Language to check.
    #
    # Returns the Float log of the language's share of trained samples.
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end
  end
end
|