ol-github-linguist 2.4.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a9c207f6a1dfc4eae287a1e95dc9dd6277d5de62
4
+ data.tar.gz: a12e690d63c1ed2024d0936d2fc129c0ef5634c2
5
+ SHA512:
6
+ metadata.gz: 6dbde1f11826f69c97d06ee6e2ad2412943302268cbba8c384c425ede0d9e58fcb600b818eecaa02ecd057e8b9c2f51d897bb6a9ad095bc73dc27eb9a1a4e74c
7
+ data.tar.gz: 8096db1ad76618186ef2b435f5242abb918e3a64222edc6a135a35f1e58650d7c9b71d74f5c0bb98048d8bdb463e4865e6473b335f5e51a64a5217d7aaebd693
data/bin/linguist ADDED
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'linguist/file_blob'
4
+ require 'linguist/repository'
5
+
6
# Command-line entry point.
#
# With a directory argument (or no argument, which means the current
# directory) prints each detected language with its rounded percentage of
# the repository size, largest first. With a file argument prints details
# about that single blob. Anything else aborts with a usage message.
target = ARGV[0] || Dir.pwd

if File.directory?(target)
  repository = Linguist::Repository.from_directory(target)
  ranked = repository.languages.sort_by { |_, bytes| bytes }.reverse

  ranked.each do |lang, bytes|
    share = ((bytes / repository.size.to_f) * 100).round
    puts "%-4s %s" % ["#{share}%", lang]
  end
elsif File.file?(target)
  blob = Linguist::FileBlob.new(target, Dir.pwd)

  # Text wins over image, and everything else is reported as binary.
  kind =
    if blob.text? then 'Text'
    elsif blob.image? then 'Image'
    else 'Binary'
    end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts " type: #{kind}"
  puts " mime type: #{blob.mime_type}"
  puts " language: #{blob.language}"

  puts " blob is too large to be shown" if blob.large?
  puts " appears to be generated source code" if blob.generated?
  puts " appears to be a vendored file" if blob.vendored?
else
  abort "usage: linguist <path>"
end
data/lib/linguist.rb ADDED
@@ -0,0 +1,5 @@
1
+ require 'linguist/blob_helper'
2
+ require 'linguist/generated'
3
+ require 'linguist/language'
4
+ require 'linguist/repository'
5
+ require 'linguist/samples'
@@ -0,0 +1,352 @@
1
+ require 'linguist/generated'
2
+ require 'linguist/language'
3
+
4
+ #require 'charlock_holmes'
5
+ #require 'escape_utils'
6
+ require 'mime/types'
7
+ #require 'pygments'
8
+ require 'yaml'
9
+
10
+ module Linguist
11
+ # BlobHelper is a mixin for Blobish classes that respond to "name",
12
+ # "data" and "size" such as Grit::Blob.
13
+ module BlobHelper
14
# Public: Get the file extension of the blob's name.
#
# Examples
#
#   blob(name='foo.rb').extname
#   # => '.rb'
#
# Returns a String. The name is converted with to_s first, so a nil name
# yields ''.
def extname
  filename = name.to_s
  File.extname(filename)
end
25
+
26
# Internal: Look up the mime type for the blob's extension.
#
# The result is memoized with `defined?`, so a nil result (no guess for
# the extension) is cached as well.
#
# Returns a MIME::Type, or nil when there is no guess.
def _mime_type
  return @_mime_type if defined? @_mime_type

  candidates = ::MIME::Types.type_for(extname.to_s)

  # Prefer a text (ascii) mime type; otherwise fall back to the first guess.
  text_candidate = candidates.detect { |candidate| candidate.ascii? }
  @_mime_type = text_candidate || candidates.first
end
41
+
42
# Public: Get the blob's mime type as a String.
#
# Examples
#
#   # => 'text/plain'
#   # => 'text/html'
#
# Returns the looked-up mime type converted with to_s, or 'text/plain'
# when the lookup produced nothing.
def mime_type
  detected = _mime_type
  detected ? detected.to_s : 'text/plain'
end
53
+
54
# Internal: Is the blob binary according to its mime type?
#
# Returns false when no mime type could be looked up; otherwise returns
# whatever the mime type's `binary?` reports.
def binary_mime_type?
  detected = _mime_type
  return false unless detected

  detected.binary?
end
60
+
61
# Public: Get the Content-Type header value.
#
# This value is used when serving raw blobs.
#
# Examples
#
#   # => 'text/plain; charset=utf-8'
#   # => 'application/octet-stream'
#
# Binary blobs report their mime type; everything else is served as
# text/plain, with a charset when an encoding is known. The result is
# memoized.
#
# Returns a content type String.
def content_type
  @content_type ||=
    if binary_mime_type? || binary?
      mime_type
    elsif encoding
      "text/plain; charset=#{encoding.downcase}"
    else
      "text/plain"
    end
end
75
+
76
# Public: Get the Content-Disposition header value.
#
# This value is used when serving raw blobs.
#
#   # => "attachment; filename=file.tar"
#   # => "inline"
#
# Text and image blobs are served inline. Everything else is an
# attachment, with the escaped basename of the blob as the filename when a
# name is available.
#
# Returns a content disposition String.
def disposition
  return 'inline' if text? || image?
  return "attachment" if name.nil?

  # CGI replaced the commented-out EscapeUtils.escape_url call, but the
  # file's require list does not load it. Load it lazily here so this
  # branch cannot fail with an uninitialized-constant error.
  require 'cgi'
  "attachment; filename=#{CGI.escape(File.basename(name))}"
end
94
+
95
# Public: Name of the detected encoding.
#
# Returns the :encoding entry of the detect_encoding result, or nil when
# detection produced nothing.
def encoding
  detected = detect_encoding
  detected[:encoding] if detected
end
100
+
101
# Try to guess the encoding.
#
# Intended to return a Hash with :encoding, :confidence and :type, or nil
# when detection fails or finds no valid encoding.
#
# Detection is currently disabled: the CharlockHolmes call is commented
# out, so this always returns nil.
def detect_encoding
  # @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
  nil
end
109
+
110
# Public: Is the blob binary?
#
# Decision order:
#   1. nil data (large blobs are never loaded into memory) -> binary
#   2. empty data                                           -> text
#   3. no encoding could be determined                      -> binary
#   4. otherwise trust the detector's :type
#
# Return true or false
def binary?
  content = data

  return true if content.nil?
  return false if content == ""
  return true if encoding.nil?

  detect_encoding[:type] == :binary
end
131
+
132
# Public: Is the blob text?
#
# This is the exact negation of `binary?`.
#
# Return true or false
def text?
  binary? ? false : true
end
138
+
139
# Public: Is the blob a supported image format?
#
# The extension is compared case-insensitively, so 'PHOTO.PNG' counts as
# an image just like 'photo.png'.
#
# Return true or false
def image?
  ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
end
145
+
146
# Public: Is the blob a supported 3D model format?
#
# The extension is compared case-insensitively, so 'PART.STL' is
# recognised just like 'part.stl'.
#
# Return true or false
def solid?
  ['.stl', '.obj'].include?(extname.downcase)
end
152
+
153
# Number of bytes in one megabyte (1024 * 1024).
MEGABYTE = 1024 ** 2

# Public: Is the blob too big to load?
#
# The size is converted with to_i, so a nil size counts as 0. Only blobs
# strictly larger than one megabyte are considered large.
#
# Return true or false
def large?
  byte_count = size.to_i
  byte_count > MEGABYTE
end
161
+
162
# Public: Is the blob safe to colorize?
#
# Pygments.rb is used for syntax highlighting. It has some quirks and is
# essentially 'un-killable' via a normal timeout, so blobs it is unlikely
# to handle well are filtered out up front: large blobs, non-text blobs
# and blobs with a high ratio of long lines.
#
# Return true or false
def safe_to_colorize?
  return false if large?

  text? && !high_ratio_of_long_lines?
end
173
+
174
# Internal: Does the blob have a high ratio of long lines?
#
# These types of files are usually going to make Pygments.rb angry if we
# try to colorize them. The check is size divided by line count, which
# must exceed 5000. A blob with no lines is never flagged.
#
# Return true or false
def high_ratio_of_long_lines?
  line_count = loc

  if line_count == 0
    false
  else
    (size / line_count) > 5000
  end
end
184
+
185
# Public: Is the blob viewable?
#
# Non-viewable blobs will just show a "View Raw" link. A blob is viewable
# when it is not large and is text.
#
# Return true or false
def viewable?
  return false if large?

  text?
end
193
+
194
# Single regexp built at load time from the entries of "vendor.yml" (next
# to this file), joined as alternatives.
VendoredRegexp = Regexp.new(
  YAML.load_file(File.expand_path("../vendor.yml", __FILE__)).join('|')
)

# Public: Is the blob in a vendored directory?
#
# Vendored files are ignored by language statistics.
#
# See "vendor.yml" for the list of vendored conventions that make up the
# pattern the name is matched against.
#
# Return true or false
def vendored?
  !!(name =~ VendoredRegexp)
end
208
+
209
# Public: Get each line of data.
#
# Requires Blob#data
#
# Only viewable blobs with data are split; anything else yields an empty
# Array. The split uses a limit of -1 so trailing empty lines are kept.
# The result is memoized.
#
# Returns an Array of lines
def lines
  @lines ||= (viewable? && data) ? data.split(line_split_character, -1) : []
end
222
+
223
# Character used to split lines. This is "\r" when Mac Format is detected
# and "\n" otherwise. The choice is memoized.
#
# Returns a split pattern string.
def line_split_character
  @line_split_character ||=
    if mac_format?
      "\r"
    else
      "\n"
    end
end
230
+
231
# Public: Is the data in ** Mac Format **? This format uses \r (0x0d)
# characters for line ends and does not include a \n (0x0a).
#
# Only the first 4096 characters are searched for a "\r". The verdict is
# based on whether the character right after that first "\r" is a "\n".
#
# Returns true when mac format is detected, false when the first "\r" is
# followed by "\n", and nil when the blob is not viewable or no "\r" is
# found.
def mac_format?
  return unless viewable?

  cr_index = data[0, 4096].index("\r")
  data[cr_index + 1] != ?\n if cr_index
end
241
+
242
# Public: Get number of lines of code.
#
# Requires Blob#data
#
# Returns the Integer number of entries in `lines`.
def loc
  lines.length
end
250
+
251
# Public: Get number of source lines of code.
#
# Requires Blob#data
#
# A source line is any line containing at least one non-whitespace
# character.
#
# Returns Integer
def sloc
  non_blank = lines.grep(/\S/)
  non_blank.size
end
259
+
260
# Public: Is the blob a generated file?
#
# Generated source code is suppressed in diffs and is ignored by language
# statistics.
#
# May load Blob#data (it is handed to the detector as a lambda, so it is
# only read if the detector calls it).
#
# The verdict is memoized with `defined?` rather than `||=`, so a false
# answer is cached too and the detector does not run again on every call.
#
# Return true or false
def generated?
  return @_generated if defined? @_generated

  @_generated = Generated.generated?(name, lambda { data })
end
271
+
272
# Public: Should the blob be indexed for searching?
#
# Excluded:
# - Files over 100 * 1024 bytes
# - Non-text files
# - Files with no detected language (except '.txt' files)
# - Languages marked as not searchable
# - Generated source files
#
# The checks run in the order listed, and '.txt' files are accepted as
# soon as they pass the size and binary checks.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_indexable` if you make any changes.
#
# Return true or false
def indexable?
  return false if size > 100 * 1024
  return false if binary?
  return true if extname == '.txt'
  return false if language.nil?
  return false unless language.searchable?

  !generated?
end
301
+
302
# Public: Detects the Language of the blob.
#
# May load Blob#data
#
# When @data is already a String it is passed to the detector directly.
# Otherwise a lambda is passed, so the data is only read if the detector
# calls it; for binary blobs the lambda yields an empty String. The
# result is memoized with `defined?`, so a nil result is cached too.
#
# Returns a Language or nil if none is detected
def language
  return @language if defined? @language

  source =
    if defined?(@data) && @data.is_a?(String)
      @data
    else
      lambda { (binary_mime_type? || binary?) ? "" : self.data }
    end

  @language = Language.detect(name.to_s, source, mode)
end
318
+
319
# Internal: Get the lexer of the blob.
#
# Uses the detected language's lexer, falling back to the Pygments
# 'Text only' lexer when no language is detected.
#
# Returns a Lexer.
def lexer
  detected = language
  return detected.lexer if detected

  Pygments::Lexer.find_by_name('Text only')
end
325
+
326
# Public: Highlight syntax of blob.
#
# options - A Hash of options (defaults to {}). Its nested :options Hash
#           is created if missing and given an :encoding entry unless one
#           is already set, so the passed Hash is modified in place.
#
# Returns html String, or nil when the blob is not safe to colorize.
def colorize(options = {})
  return unless safe_to_colorize?

  lexer_options = (options[:options] ||= {})
  lexer_options[:encoding] ||= encoding
  lexer.highlight(data, options)
end
337
+
338
# Public: Highlight syntax of blob without the outer highlight div
# wrapper.
#
# options - A Hash of options (defaults to {})
#
# Returns the html String found inside the wrapper's <pre> element, ''
# when the blob was not colorized, or nil when the colorized output does
# not contain the expected wrapper.
def colorize_without_wrapper(options = {})
  html = colorize(options)
  return '' unless html

  html[%r{<div class="highlight"><pre>(.*?)</pre>\s*</div>}m, 1]
end
351
+ end
352
+ end
@@ -0,0 +1,123 @@
1
+ require 'linguist/tokenizer'
2
+
3
module Linguist
  # Language bayesian classifier.
  #
  # The database is a plain Hash with these String keys:
  #   'tokens_total'    - Integer count of every token seen
  #   'languages_total' - Integer count of every trained sample
  #   'tokens'          - Hash of language => { token => count }
  #   'language_tokens' - Hash of language => token count
  #   'languages'       - Hash of language => sample count
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object (modified in place)
    # language - String language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   Classifier.train!(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      db['tokens_total'] ||= 0
      db['languages_total'] ||= 0
      db['tokens'] ||= {}
      db['language_tokens'] ||= {}
      db['languages'] ||= {}

      # The per-language token tables are only created once a token is
      # seen, so a sample with zero tokens leaves them absent.
      tokens.each do |token|
        db['tokens'][language] ||= {}
        db['tokens'][language][token] ||= 0
        db['tokens'][language][token] += 1
        db['language_tokens'][language] ||= 0
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1
      end
      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to
    #             (defaults to every language in the database).
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [['Ruby', -4.4], ['Python', -5.1], ...]   (illustrative values)
    #
    # Returns sorted Array of result pairs, best match first. Each pair
    # contains the String language name and a Float score. Scores are sums
    # of natural logarithms, so higher (closer to zero) is better.
    def self.classify(db, tokens, languages = nil)
      languages ||= db['languages'].keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier.
    #
    # db - Hash classifier database, as built by train!.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens']
      @language_tokens = db['language_tokens']
      @languages       = db['languages']
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs, best match first. Each pair
    # contains the String language name and a Float score. Returns an
    # empty Array when tokens is nil.
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      scores = {}
      languages.each do |language|
        scores[language] = tokens_probability(tokens, language) +
                           language_probability(language)
      end

      # Hash#sort already yields [language, score] pairs; order descending.
      scores.sort { |a, b| b[1] <=> a[1] }
    end

    # Internal: Log probability of a set of tokens occurring in a
    # language - log P(D | C)
    #
    # tokens   - Array of String tokens.
    # language - Language to check.
    #
    # Returns the Float sum of the natural logs of each token's
    # probability.
    def tokens_probability(tokens, language)
      tokens.inject(0.0) do |sum, token|
        sum + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of token in language occurring - P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # A token never seen for the language is given the small probability
    # 1 / tokens_total instead of zero. A language with no token table at
    # all is treated the same way instead of raising on nil.
    #
    # Returns Float.
    def token_probability(token, language)
      seen = (@tokens[language] || {})[token].to_f

      if seen == 0.0
        1 / @tokens_total.to_f
      else
        seen / @language_tokens[language].to_f
      end
    end

    # Internal: Log probability of a language occurring - log P(C)
    #
    # language - Language to check.
    #
    # Returns the Float natural log of the language's share of all
    # trained samples.
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end
  end
end