ol-github-linguist 2.4.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: a9c207f6a1dfc4eae287a1e95dc9dd6277d5de62
+   data.tar.gz: a12e690d63c1ed2024d0936d2fc129c0ef5634c2
+ SHA512:
+   metadata.gz: 6dbde1f11826f69c97d06ee6e2ad2412943302268cbba8c384c425ede0d9e58fcb600b818eecaa02ecd057e8b9c2f51d897bb6a9ad095bc73dc27eb9a1a4e74c
+   data.tar.gz: 8096db1ad76618186ef2b435f5242abb918e3a64222edc6a135a35f1e58650d7c9b71d74f5c0bb98048d8bdb463e4865e6473b335f5e51a64a5217d7aaebd693
data/bin/linguist ADDED
@@ -0,0 +1,42 @@
+ #!/usr/bin/env ruby
+
+ require 'linguist/file_blob'
+ require 'linguist/repository'
+
+ path = ARGV[0] || Dir.pwd
+
+ if File.directory?(path)
+   repo = Linguist::Repository.from_directory(path)
+   repo.languages.sort_by { |_, size| size }.reverse.each do |language, size|
+     percentage = ((size / repo.size.to_f) * 100).round
+     puts "%-4s %s" % ["#{percentage}%", language]
+   end
+ elsif File.file?(path)
+   blob = Linguist::FileBlob.new(path, Dir.pwd)
+   type = if blob.text?
+     'Text'
+   elsif blob.image?
+     'Image'
+   else
+     'Binary'
+   end
+
+   puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
+   puts "  type:      #{type}"
+   puts "  mime type: #{blob.mime_type}"
+   puts "  language:  #{blob.language}"
+
+   if blob.large?
+     puts "  blob is too large to be shown"
+   end
+
+   if blob.generated?
+     puts "  appears to be generated source code"
+   end
+
+   if blob.vendored?
+     puts "  appears to be a vendored file"
+   end
+ else
+   abort "usage: linguist <path>"
+ end
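For orientation, the same library calls the script relies on can be driven directly from Ruby. A minimal sketch, assuming the gem is installed and that a file named app.rb exists in the current directory (the filename is only illustrative):

require 'linguist/file_blob'

# Inspect a single file the same way bin/linguist does.
blob = Linguist::FileBlob.new('app.rb', Dir.pwd)
puts blob.mime_type   # MIME type guessed from the file extension
puts blob.loc         # lines of code, per BlobHelper#loc
puts blob.sloc        # source lines of code (non-blank lines)
puts blob.language    # detected Language, or nil if none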
data/lib/linguist.rb ADDED
@@ -0,0 +1,5 @@
+ require 'linguist/blob_helper'
+ require 'linguist/generated'
+ require 'linguist/language'
+ require 'linguist/repository'
+ require 'linguist/samples'
data/lib/linguist/blob_helper.rb ADDED
@@ -0,0 +1,352 @@
+ require 'linguist/generated'
+ require 'linguist/language'
+
+ #require 'charlock_holmes'
+ #require 'escape_utils'
+ require 'mime/types'
+ #require 'pygments'
+ require 'yaml'
+
+ module Linguist
+   # BlobHelper is a mixin for Blobish classes that respond to "name",
+   # "data" and "size" such as Grit::Blob.
+   module BlobHelper
+     # Public: Get the extname of the path
+     #
+     # Examples
+     #
+     #   blob(name='foo.rb').extname
+     #   # => '.rb'
+     #
+     # Returns a String
+     def extname
+       File.extname(name.to_s)
+     end
+
+     # Internal: Lookup mime type for extension.
+     #
+     # Returns a MIME::Type
+     def _mime_type
+       if defined? @_mime_type
+         @_mime_type
+       else
+         guesses = ::MIME::Types.type_for(extname.to_s)
+
+         # Prefer text mime types over binary
+         @_mime_type = guesses.detect { |type| type.ascii? } ||
+           # Otherwise use the first guess
+           guesses.first
+       end
+     end
+
+     # Public: Get the actual blob mime type
+     #
+     # Examples
+     #
+     #   # => 'text/plain'
+     #   # => 'text/html'
+     #
+     # Returns a mime type String.
+     def mime_type
+       _mime_type ? _mime_type.to_s : 'text/plain'
+     end
+
+     # Internal: Is the blob binary according to its mime type
+     #
+     # Return true or false
+     def binary_mime_type?
+       _mime_type ? _mime_type.binary? : false
+     end
+
+     # Public: Get the Content-Type header value
+     #
+     # This value is used when serving raw blobs.
+     #
+     # Examples
+     #
+     #   # => 'text/plain; charset=utf-8'
+     #   # => 'application/octet-stream'
+     #
+     # Returns a content type String.
+     def content_type
+       @content_type ||= (binary_mime_type? || binary?) ? mime_type :
+         (encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")
+     end
+
+     # Public: Get the Content-Disposition header value
+     #
+     # This value is used when serving raw blobs.
+     #
+     #   # => "attachment; filename=file.tar"
+     #   # => "inline"
+     #
+     # Returns a content disposition String.
+     def disposition
+       if text? || image?
+         'inline'
+       elsif name.nil?
+         "attachment"
+       else
+         #"attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
+         "attachment; filename=#{CGI.escape(File.basename(name))}"
+       end
+     end
+
+     def encoding
+       if hash = detect_encoding
+         hash[:encoding]
+       end
+     end
+
+     # Try to guess the encoding
+     #
+     # Returns: a Hash, with :encoding, :confidence, :type
+     #          this will return nil if an error occurred during detection or
+     #          no valid encoding could be found
+     def detect_encoding
+       nil # @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
+     end
+
+     # Public: Is the blob binary?
+     #
+     # Return true or false
+     def binary?
+       # Large blobs aren't even loaded into memory
+       if data.nil?
+         true
+
+       # Treat blank files as text
+       elsif data == ""
+         false
+
+       # Charlock doesn't know what to think
+       elsif encoding.nil?
+         true
+
+       # If Charlock says it's binary
+       else
+         detect_encoding[:type] == :binary
+       end
+     end
+
+     # Public: Is the blob text?
+     #
+     # Return true or false
+     def text?
+       !binary?
+     end
+
+     # Public: Is the blob a supported image format?
+     #
+     # Return true or false
+     def image?
+       ['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
+     end
+
+     # Public: Is the blob a supported 3D model format?
+     #
+     # Return true or false
+     def solid?
+       ['.stl', '.obj'].include?(extname)
+     end
+
+     MEGABYTE = 1024 * 1024
+
+     # Public: Is the blob too big to load?
+     #
+     # Return true or false
+     def large?
+       size.to_i > MEGABYTE
+     end
+
+     # Public: Is the blob safe to colorize?
+     #
+     # We use Pygments.rb for syntax highlighting blobs, which
+     # has some quirks and also is essentially 'un-killable' via
+     # normal timeout. To work around this we try to avoid handing
+     # Pygments.rb anything it can't handle.
+     #
+     # Return true or false
+     def safe_to_colorize?
+       !large? && text? && !high_ratio_of_long_lines?
+     end
+
+     # Internal: Does the blob have a high ratio of long lines?
+     #
+     # These types of files are usually going to make Pygments.rb
+     # angry if we try to colorize them.
+     #
+     # Return true or false
+     def high_ratio_of_long_lines?
+       return false if loc == 0
+       size / loc > 5000
+     end
+
+     # Public: Is the blob viewable?
+     #
+     # Non-viewable blobs will just show a "View Raw" link
+     #
+     # Return true or false
+     def viewable?
+       !large? && text?
+     end
+
+     vendored_paths = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
+     VendoredRegexp = Regexp.new(vendored_paths.join('|'))
+
+     # Public: Is the blob in a vendored directory?
+     #
+     # Vendored files are ignored by language statistics.
+     #
+     # See "vendor.yml" for a list of vendored conventions that match
+     # this pattern.
+     #
+     # Return true or false
+     def vendored?
+       name =~ VendoredRegexp ? true : false
+     end
+
+     # Public: Get each line of data
+     #
+     # Requires Blob#data
+     #
+     # Returns an Array of lines
+     def lines
+       @lines ||=
+         if viewable? && data
+           data.split(line_split_character, -1)
+         else
+           []
+         end
+     end
+
+     # Character used to split lines. This is almost always "\n" except when Mac
+     # Format is detected in which case it's "\r".
+     #
+     # Returns a split pattern string.
+     def line_split_character
+       @line_split_character ||= (mac_format? ? "\r" : "\n")
+     end
+
+     # Public: Is the data in ** Mac Format **. This format uses \r (0x0d) characters
+     # for line ends and does not include a \n (0x0a).
+     #
+     # Returns true when mac format is detected.
+     def mac_format?
+       return if !viewable?
+       if pos = data[0, 4096].index("\r")
+         data[pos + 1] != ?\n
+       end
+     end
+
+     # Public: Get number of lines of code
+     #
+     # Requires Blob#data
+     #
+     # Returns Integer
+     def loc
+       lines.size
+     end
+
+     # Public: Get number of source lines of code
+     #
+     # Requires Blob#data
+     #
+     # Returns Integer
+     def sloc
+       lines.grep(/\S/).size
+     end
+
+     # Public: Is the blob a generated file?
+     #
+     # Generated source code is suppressed in diffs and is ignored by
+     # language statistics.
+     #
+     # May load Blob#data
+     #
+     # Return true or false
+     def generated?
+       @_generated ||= Generated.generated?(name, lambda { data })
+     end
+
+     # Public: Should the blob be indexed for searching?
+     #
+     # Excluded:
+     # - Files over 0.1MB
+     # - Non-text files
+     # - Languages marked as not searchable
+     # - Generated source files
+     #
+     # Please add additional test coverage to
+     # `test/test_blob.rb#test_indexable` if you make any changes.
+     #
+     # Return true or false
+     def indexable?
+       if size > 100 * 1024
+         false
+       elsif binary?
+         false
+       elsif extname == '.txt'
+         true
+       elsif language.nil?
+         false
+       elsif !language.searchable?
+         false
+       elsif generated?
+         false
+       else
+         true
+       end
+     end
+
+     # Public: Detects the Language of the blob.
+     #
+     # May load Blob#data
+     #
+     # Returns a Language or nil if none is detected
+     def language
+       return @language if defined? @language
+
+       if defined?(@data) && @data.is_a?(String)
+         data = @data
+       else
+         data = lambda { (binary_mime_type? || binary?) ? "" : self.data }
+       end
+
+       @language = Language.detect(name.to_s, data, mode)
+     end
+
+     # Internal: Get the lexer of the blob.
+     #
+     # Returns a Lexer.
+     def lexer
+       language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
+     end
+
+     # Public: Highlight syntax of blob
+     #
+     # options - A Hash of options (defaults to {})
+     #
+     # Returns html String
+     def colorize(options = {})
+       return unless safe_to_colorize?
+       options[:options] ||= {}
+       options[:options][:encoding] ||= encoding
+       lexer.highlight(data, options)
+     end
+
+     # Public: Highlight syntax of blob without the outer highlight div
+     # wrapper.
+     #
+     # options - A Hash of options (defaults to {})
+     #
+     # Returns html String
+     def colorize_without_wrapper(options = {})
+       if text = colorize(options)
+         text[%r{<div class="highlight"><pre>(.*?)</pre>\s*</div>}m, 1]
+       else
+         ''
+       end
+     end
+   end
+ end
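BlobHelper only assumes the including class responds to name, data and size (plus mode, which #language passes to Language.detect). A purely illustrative host class, not part of the gem, might look like the sketch below; note that with the CharlockHolmes require commented out above, detect_encoding returns nil, so binary? reports true for any non-empty data.

require 'linguist/blob_helper'

# Illustrative only: an in-memory blob that satisfies the BlobHelper contract.
class MemoryBlob
  include Linguist::BlobHelper

  attr_reader :name, :data

  def initialize(name, data)
    @name = name
    @data = data
  end

  def size
    @data.bytesize
  end
end

blob = MemoryBlob.new('hello.rb', "puts 'hi'\n")
blob.extname   # => ".rb"
blob.large?    # => false, well under the 1 MB threshold
blob.binary?   # => true here, because encoding detection is stubbed out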
data/lib/linguist/classifier.rb ADDED
@@ -0,0 +1,123 @@
+ require 'linguist/tokenizer'
+
+ module Linguist
+   # Language bayesian classifier.
+   class Classifier
+     # Public: Train classifier that data is a certain language.
+     #
+     # db       - Hash classifier database object
+     # language - String language of data
+     # data     - String contents of file
+     #
+     # Examples
+     #
+     #   Classifier.train!(db, 'Ruby', "def hello; end")
+     #
+     # Returns nothing.
+     def self.train!(db, language, data)
+       tokens = Tokenizer.tokenize(data)
+
+       db['tokens_total'] ||= 0
+       db['languages_total'] ||= 0
+       db['tokens'] ||= {}
+       db['language_tokens'] ||= {}
+       db['languages'] ||= {}
+
+       tokens.each do |token|
+         db['tokens'][language] ||= {}
+         db['tokens'][language][token] ||= 0
+         db['tokens'][language][token] += 1
+         db['language_tokens'][language] ||= 0
+         db['language_tokens'][language] += 1
+         db['tokens_total'] += 1
+       end
+       db['languages'][language] ||= 0
+       db['languages'][language] += 1
+       db['languages_total'] += 1
+
+       nil
+     end
+
+     # Public: Guess language of data.
+     #
+     # db        - Hash of classifier tokens database.
+     # data      - Array of tokens or String data to analyze.
+     # languages - Array of language name Strings to restrict to.
+     #
+     # Examples
+     #
+     #   Classifier.classify(db, "def hello; end")
+     #   # => [ ['Ruby', 0.90], ['Python', 0.2], ... ]
+     #
+     # Returns sorted Array of result pairs. Each pair contains the
+     # String language name and a Float score.
+     def self.classify(db, tokens, languages = nil)
+       languages ||= db['languages'].keys
+       new(db).classify(tokens, languages)
+     end
+
+     # Internal: Initialize a Classifier.
+     def initialize(db = {})
+       @tokens_total    = db['tokens_total']
+       @languages_total = db['languages_total']
+       @tokens          = db['tokens']
+       @language_tokens = db['language_tokens']
+       @languages       = db['languages']
+     end
+
+     # Internal: Guess language of data
+     #
+     # data      - Array of tokens or String data to analyze.
+     # languages - Array of language name Strings to restrict to.
+     #
+     # Returns sorted Array of result pairs. Each pair contains the
+     # String language name and a Float score.
+     def classify(tokens, languages)
+       return [] if tokens.nil?
+       tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
+
+       scores = {}
+       languages.each do |language|
+         scores[language] = tokens_probability(tokens, language) +
+           language_probability(language)
+       end
+
+       scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
+     end
+
+     # Internal: Probability of a set of tokens occurring in a language - P(D | C)
+     #
+     # tokens   - Array of String tokens.
+     # language - Language to check.
+     #
+     # Returns Float between 0.0 and 1.0.
+     def tokens_probability(tokens, language)
+       tokens.inject(0.0) do |sum, token|
+         sum += Math.log(token_probability(token, language))
+       end
+     end
+
+     # Internal: Probability of a token occurring in a language - P(F | C)
+     #
+     # token    - String token.
+     # language - Language to check.
+     #
+     # Returns Float between 0.0 and 1.0.
+     def token_probability(token, language)
+       if @tokens[language][token].to_f == 0.0
+         1 / @tokens_total.to_f
+       else
+         @tokens[language][token].to_f / @language_tokens[language].to_f
+       end
+     end
+
+     # Internal: Probability of a language occurring - P(C)
+     #
+     # language - Language to check.
+     #
+     # Returns Float between 0.0 and 1.0.
+     def language_probability(language)
+       Math.log(@languages[language].to_f / @languages_total.to_f)
+     end
+   end
+ end
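Putting the two public entry points together, a quick sketch of training and querying an in-memory database follows. The scores classify returns are sums of log probabilities, so they are negative, and the exact values depend on the tokenizer and the training data.

require 'linguist/classifier'

db = {}
Linguist::Classifier.train!(db, 'Ruby',   "def hello; puts 'hi'; end")
Linguist::Classifier.train!(db, 'Python', "def hello():\n    print('hi')")

# Returns [[language, score], ...] sorted best match first.
Linguist::Classifier.classify(db, "def goodbye; end").first
# => most likely ["Ruby", ...]; the score value depends on the training data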