geothird-linguist 2.6.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 29e608de9f7d1f047fedc42252372a33c8f9af97
4
+ data.tar.gz: 184de1c9648189df496844f1b83299be88597ceb
5
+ SHA512:
6
+ metadata.gz: 28e2e56c28062cbb43bd9b54bc522512caf479459a065db81fccbb14d11a6e74060bb695b85b5b04b05325863b6388187581e7fe44f2b8573d6d0faa90f6c8ba
7
+ data.tar.gz: f561dd836463b6ea186fdc98ace219e0d8901a209875091ac6279e23375c0ab86346a96458b08aa462ba021675ed0310e685ecb6293a57b2ab48e9aca7c0d90f
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'linguist/file_blob'
4
+ require 'linguist/repository'
5
+
6
# Command-line entry point.
#
# Usage: linguist <path>
#
# With a directory, prints every language reported by Linguist::Repository
# together with its rounded percentage of the repository size, highest
# first. With a regular file, prints a short report about that blob. With
# anything else, aborts with the usage message. The path defaults to the
# current working directory when no argument is given.
target = ARGV[0] || Dir.pwd

if File.directory?(target)
  repository = Linguist::Repository.from_directory(target)

  # Ascending sort walked backwards, so the largest language comes first.
  ranked = repository.languages.sort_by { |_, language_size| language_size }
  ranked.reverse_each do |language, language_size|
    share = ((language_size / repository.size.to_f) * 100).round
    puts format("%-4s %s", "#{share}%", language)
  end
elsif File.file?(target)
  blob = Linguist::FileBlob.new(target, Dir.pwd)

  # Text wins over image; everything else is reported as binary.
  kind =
    if blob.text?
      'Text'
    elsif blob.image?
      'Image'
    else
      'Binary'
    end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts " type: #{kind}"
  puts " mime type: #{blob.mime_type}"
  puts " language: #{blob.language}"

  # Optional remarks, each printed only when the blob reports the trait.
  puts " blob is too large to be shown" if blob.large?
  puts " appears to be generated source code" if blob.generated?
  puts " appears to be a vendored file" if blob.vendored?
else
  abort "usage: linguist <path>"
end
@@ -0,0 +1,5 @@
1
+ require 'linguist/blob_helper'
2
+ require 'linguist/generated'
3
+ require 'linguist/language'
4
+ require 'linguist/repository'
5
+ require 'linguist/samples'
@@ -0,0 +1,360 @@
1
+ require 'linguist/generated'
2
+ require 'linguist/language'
3
+
4
+ require 'charlock_holmes'
5
+ require 'escape_utils'
6
+ require 'mime/types'
7
+ require 'pygments'
8
+ require 'yaml'
9
+
10
module Linguist
  # BlobHelper is a mixin for Blobish classes that respond to "name",
  # "data" and "size" such as Grit::Blob.
  #
  # Note that #language additionally calls "mode" on the including object.
  module BlobHelper
    # Public: Get the extname of the path
    #
    # Examples
    #
    #   blob(name='foo.rb').extname
    #   # => '.rb'
    #
    # Returns a String (empty when name is nil or has no extension).
    def extname
      File.extname(name.to_s)
    end

    # Internal: Lookup mime type for extension.
    #
    # The lookup result is memoized, including a nil result.
    #
    # Returns a MIME::Type or nil if the extension is unknown.
    def _mime_type
      return @_mime_type if defined? @_mime_type

      guesses = ::MIME::Types.type_for(extname.to_s)

      # Prefer text mime types over binary, otherwise use the first guess
      @_mime_type = guesses.detect { |type| type.ascii? } || guesses.first
    end

    # Public: Get the actual blob mime type
    #
    # Examples
    #
    #   # => 'text/plain'
    #   # => 'text/html'
    #
    # Returns a mime type String, 'text/plain' when no type is known.
    def mime_type
      _mime_type ? _mime_type.to_s : 'text/plain'
    end

    # Internal: Is the blob binary according to its mime type
    #
    # Return true or false
    def binary_mime_type?
      _mime_type ? _mime_type.binary? : false
    end

    # Internal: Is the blob binary according to its mime type,
    # overriding it if we have better data from the languages.yml
    # database.
    #
    # Return true or false
    def likely_binary?
      binary_mime_type? && !Language.find_by_filename(name)
    end

    # Public: Get the Content-Type header value
    #
    # This value is used when serving raw blobs.
    #
    # Examples
    #
    #   # => 'text/plain; charset=utf-8'
    #   # => 'application/octet-stream'
    #
    # Returns a content type String.
    def content_type
      @content_type ||=
        if binary_mime_type? || binary?
          mime_type
        elsif encoding
          "text/plain; charset=#{encoding.downcase}"
        else
          "text/plain"
        end
    end

    # Public: Get the Content-Disposition header value
    #
    # This value is used when serving raw blobs.
    #
    #   # => "attachment; filename=file.tar"
    #   # => "inline"
    #
    # Returns a content disposition String.
    def disposition
      if text? || image?
        'inline'
      elsif name.nil?
        "attachment"
      else
        "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
      end
    end

    # Public: Get the detected encoding name of the blob.
    #
    # Returns the :encoding entry of #detect_encoding, or nil when no
    # detection result is available.
    def encoding
      detected = detect_encoding
      detected && detected[:encoding]
    end

    # Try to guess the encoding
    #
    # The detector runs at most once per blob that has data; its result is
    # memoized even when it is nil, so callers such as #binary? do not pay
    # for a second detection.
    #
    # Returns: a Hash, with :encoding, :confidence, :type
    #          this will return nil if an error occurred during detection or
    #          no valid encoding could be found
    def detect_encoding
      return unless data
      return @detect_encoding if defined? @detect_encoding

      @detect_encoding = CharlockHolmes::EncodingDetector.new.detect(data)
    end

    # Public: Is the blob binary?
    #
    # Return true or false
    def binary?
      # Large blobs aren't even loaded into memory
      if data.nil?
        true

      # Treat blank files as text
      elsif data == ""
        false

      # Charlock doesn't know what to think
      elsif encoding.nil?
        true

      # If Charlock says its binary
      else
        detect_encoding[:type] == :binary
      end
    end

    # Public: Is the blob text?
    #
    # Return true or false
    def text?
      !binary?
    end

    # Public: Is the blob a supported image format?
    #
    # The extension is compared case-insensitively, so "photo.JPG" counts.
    #
    # Return true or false
    def image?
      ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
    end

    # Public: Is the blob a supported 3D model format?
    #
    # The extension is compared case-insensitively.
    #
    # Return true or false
    def solid?
      ['.stl', '.obj'].include?(extname.downcase)
    end

    MEGABYTE = 1024 * 1024

    # Public: Is the blob too big to load?
    #
    # A nil size is treated as 0.
    #
    # Return true or false
    def large?
      size.to_i > MEGABYTE
    end

    # Public: Is the blob safe to colorize?
    #
    # We use Pygments.rb for syntax highlighting blobs, which
    # has some quirks and also is essentially 'un-killable' via
    # normal timeout. To work around this we try to avoid handing
    # Pygments.rb anything it can't handle.
    #
    # Return true or false
    def safe_to_colorize?
      !large? && text? && !high_ratio_of_long_lines?
    end

    # Internal: Does the blob have a high ratio of long lines?
    #
    # The check is size divided by line count (integer division when both
    # are Integers) compared against 5000. Blobs with no lines (which
    # includes non-viewable blobs, see #lines) never qualify.
    #
    # These types of files are usually going to make Pygments.rb
    # angry if we try to colorize them.
    #
    # Return true or false
    def high_ratio_of_long_lines?
      return false if loc == 0
      size / loc > 5000
    end

    # Public: Is the blob viewable?
    #
    # Non-viewable blobs will just show a "View Raw" link
    #
    # Return true or false
    def viewable?
      !large? && text?
    end

    # Patterns are read once, when this module is loaded, from the
    # "vendor.yml" file that sits next to this source file.
    vendored_paths = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
    VendoredRegexp = Regexp.new(vendored_paths.join('|'))

    # Public: Is the blob in a vendored directory?
    #
    # Vendored files are ignored by language statistics.
    #
    # See "vendor.yml" for a list of vendored conventions that match
    # this pattern.
    #
    # Return true or false
    def vendored?
      name =~ VendoredRegexp ? true : false
    end

    # Public: Get each line of data
    #
    # Requires Blob#data
    #
    # Returns an Array of lines (empty for non-viewable blobs)
    def lines
      @lines ||=
        if viewable? && data
          # The -1 limit keeps trailing empty strings in the result.
          data.split(line_split_character, -1)
        else
          []
        end
    end

    # Character used to split lines. This is almost always "\n" except when Mac
    # Format is detected in which case it's "\r".
    #
    # Returns a split pattern string.
    def line_split_character
      @line_split_character ||= (mac_format? ? "\r" : "\n")
    end

    # Public: Is the data in ** Mac Format **. This format uses \r (0x0d) characters
    # for line ends and does not include a \n (0x0a).
    #
    # Only the first "\r" within the first 4096 characters is inspected.
    #
    # Returns true when mac format is detected, a falsy value otherwise.
    def mac_format?
      return unless viewable?

      if pos = data[0, 4096].index("\r")
        data[pos + 1] != ?\n
      end
    end

    # Public: Get number of lines of code
    #
    # Requires Blob#data
    #
    # Returns Integer
    def loc
      lines.size
    end

    # Public: Get number of source lines of code
    #
    # Counts the lines that contain at least one non-whitespace character.
    #
    # Requires Blob#data
    #
    # Returns Integer
    def sloc
      lines.grep(/\S/).size
    end

    # Public: Is the blob a generated file?
    #
    # Generated source code is suppressed in diffs and is ignored by
    # language statistics.
    #
    # The answer is memoized with `defined?` so that a false result is
    # cached as well and the check is not repeated.
    #
    # May load Blob#data
    #
    # Return true or false
    def generated?
      return @_generated if defined? @_generated

      @_generated = Generated.generated?(name, lambda { data })
    end

    # Public: Should the blob be indexed for searching?
    #
    # Excluded:
    # - Files over 100KB
    # - Non-text files
    # - Languages marked as not searchable
    # - Generated source files
    #
    # ".txt" files that pass the size and text checks are always indexed,
    # whatever language is detected.
    #
    # Please add additional test coverage to
    # `test/test_blob.rb#test_indexable` if you make any changes.
    #
    # Return true or false
    def indexable?
      # size.to_i mirrors #large? so a nil size does not raise here.
      return false if size.to_i > 100 * 1024
      return false if binary?
      return true if extname == '.txt'
      return false if language.nil? || !language.searchable?

      !generated?
    end

    # Public: Detects the Language of the blob.
    #
    # May load Blob#data
    #
    # Returns a Language or nil if none is detected
    def language
      return @language if defined? @language

      source =
        if defined?(@data) && @data.is_a?(String)
          # Data is already in memory, hand it over directly.
          @data
        else
          # Defer loading; binary blobs are presented as empty text.
          lambda { (binary_mime_type? || binary?) ? "" : data }
        end

      @language = Language.detect(name.to_s, source, mode)
    end

    # Internal: Get the lexer of the blob.
    #
    # Returns a Lexer, the 'Text only' lexer when no language is detected.
    def lexer
      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
    end

    # Public: Highlight syntax of blob
    #
    # options - A Hash of options (defaults to {}). The Hash and its nested
    #           :options Hash are copied, never modified in place.
    #
    # Returns html String, or nil when the blob is not safe to colorize.
    def colorize(options = {})
      return unless safe_to_colorize?

      # Work on copies so the detected encoding does not leak into a Hash
      # the caller may reuse for another blob.
      lexer_options = (options[:options] || {}).dup
      lexer_options[:encoding] ||= encoding
      lexer.highlight(data, options.merge(:options => lexer_options))
    end

    # Public: Highlight syntax of blob without the outer highlight div
    # wrapper.
    #
    # options - A Hash of options (defaults to {})
    #
    # Returns html String ('' when the blob is not safe to colorize).
    def colorize_without_wrapper(options = {})
      if text = colorize(options)
        text[%r{<div class="highlight"><pre>(.*?)</pre>\s*</div>}m, 1]
      else
        ''
      end
    end
  end
end
@@ -0,0 +1,123 @@
1
+ require 'linguist/tokenizer'
2
+
3
module Linguist
  # Language bayesian classifier.
  #
  # Scores are sums of natural logarithms of probabilities, so they are
  # only meaningful relative to each other: the highest score wins.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object (modified in place)
    # language - String language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   Classifier.train!(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      db['tokens_total'] ||= 0
      db['languages_total'] ||= 0
      db['tokens'] ||= {}
      db['language_tokens'] ||= {}
      db['languages'] ||= {}

      tokens.each do |token|
        # Per-language entries are created inside the loop on purpose: data
        # that yields no tokens leaves no entry under 'tokens' or
        # 'language_tokens' for this language.
        db['tokens'][language] ||= {}
        db['tokens'][language][token] ||= 0
        db['tokens'][language][token] += 1
        db['language_tokens'][language] ||= 0
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1
      end
      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to
    #             (defaults to every language in db).
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [['Ruby', -2.3], ['Python', -4.1], ...]
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def self.classify(db, tokens, languages = nil)
      # An untrained db has no 'languages' entry; treat it as no candidates.
      languages ||= (db['languages'] || {}).keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier.
    #
    # db - Hash classifier database as built by Classifier.train!
    #      (defaults to {}). Missing tables are treated as empty.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens'] || {}
      @language_tokens = db['language_tokens'] || {}
      @languages       = db['languages'] || {}
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs, best score first. Each pair
    # contains the String language name and a Float score. Returns an empty
    # Array when tokens is nil.
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      scores = {}
      languages.each do |language|
        scores[language] = tokens_probability(tokens, language) +
                           language_probability(language)
      end

      # Hash#sort already yields [language, score] pairs; order descending.
      scores.sort { |a, b| b[1] <=> a[1] }
    end

    # Internal: Log probability of set of tokens in a language occurring -
    # log P(D | C)
    #
    # tokens   - Array of String tokens.
    # language - Language to check.
    #
    # Returns Float, the sum of Math.log of each token's probability.
    def tokens_probability(tokens, language)
      tokens.inject(0.0) do |sum, token|
        sum + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of token in language occurring - P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # Returns Float: the token's count divided by the language's token
    # count, or 1 / total tokens when the token was never seen for the
    # language (including languages that have no tokens at all).
    def token_probability(token, language)
      # A language trained only on token-less data has no 'tokens' entry.
      count = (@tokens[language] || {})[token].to_f

      if count == 0.0
        1 / @tokens_total.to_f
      else
        count / @language_tokens[language].to_f
      end
    end

    # Internal: Log probability of a language occurring - log P(C)
    #
    # language - Language to check.
    #
    # Returns Float, Math.log of the share of trained samples that belong
    # to the language.
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end
  end
end