geothird-linguist 2.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 29e608de9f7d1f047fedc42252372a33c8f9af97
4
+ data.tar.gz: 184de1c9648189df496844f1b83299be88597ceb
5
+ SHA512:
6
+ metadata.gz: 28e2e56c28062cbb43bd9b54bc522512caf479459a065db81fccbb14d11a6e74060bb695b85b5b04b05325863b6388187581e7fe44f2b8573d6d0faa90f6c8ba
7
+ data.tar.gz: f561dd836463b6ea186fdc98ace219e0d8901a209875091ac6279e23375c0ab86346a96458b08aa462ba021675ed0310e685ecb6293a57b2ab48e9aca7c0d90f
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'linguist/file_blob'
4
+ require 'linguist/repository'
5
+
6
# Target to inspect: the first command-line argument, or the current
# working directory when none is given.
path = ARGV[0] || Dir.pwd

case
when File.directory?(path)
  # Directory: print each language with its rounded share of the
  # repository size, largest share first.
  repo = Linguist::Repository.from_directory(path)
  ranked = repo.languages.sort_by { |_, size| size }.reverse
  ranked.each do |language, size|
    percentage = ((size / repo.size.to_f) * 100).round
    puts format("%-4s %s", "#{percentage}%", language)
  end
when File.file?(path)
  # Single file: print line counts, coarse type, mime type and language,
  # followed by any notable flags.
  blob = Linguist::FileBlob.new(path, Dir.pwd)
  type =
    case
    when blob.text? then 'Text'
    when blob.image? then 'Image'
    else 'Binary'
    end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts " type: #{type}"
  puts " mime type: #{blob.mime_type}"
  puts " language: #{blob.language}"

  puts " blob is too large to be shown" if blob.large?

  puts " appears to be generated source code" if blob.generated?

  puts " appears to be a vendored file" if blob.vendored?
else
  # Neither a directory nor a regular file: print usage and exit non-zero.
  abort "usage: linguist <path>"
end
@@ -0,0 +1,5 @@
1
+ require 'linguist/blob_helper'
2
+ require 'linguist/generated'
3
+ require 'linguist/language'
4
+ require 'linguist/repository'
5
+ require 'linguist/samples'
@@ -0,0 +1,360 @@
1
+ require 'linguist/generated'
2
+ require 'linguist/language'
3
+
4
+ require 'charlock_holmes'
5
+ require 'escape_utils'
6
+ require 'mime/types'
7
+ require 'pygments'
8
+ require 'yaml'
9
+
10
+ module Linguist
11
+ # BlobHelper is a mixin for Blobish classes that respond to "name",
12
+ # "data" and "size" such as Grit::Blob.
13
+ module BlobHelper
14
# Public: The extension of the blob's name, including the leading dot.
#
# The name is coerced with to_s first, so a nil name yields ''.
#
# Examples
#
#   blob(name='foo.rb').extname
#   # => '.rb'
#
# Returns a String.
def extname
  filename = name.to_s
  File.extname(filename)
end
25
+
26
# Internal: Look up the MIME type for the blob's extension.
#
# The result is cached in @_mime_type; because the cache is checked with
# `defined?`, a nil result is cached as well.
#
# Returns a MIME::Type, or nil when no type is registered for the extension.
def _mime_type
  return @_mime_type if defined? @_mime_type

  candidates = ::MIME::Types.type_for(extname.to_s)

  # A text (ascii) type wins over a binary one; failing that, take the
  # first guess (nil when there were no guesses at all).
  text_type = candidates.detect { |candidate| candidate.ascii? }
  @_mime_type = text_type || candidates.first
end
41
+
42
# Public: The blob's mime type as a String.
#
# Falls back to 'text/plain' when no mime type is known for the extension.
#
# Examples
#
#   # => 'text/plain'
#   # => 'text/html'
#
# Returns a mime type String.
def mime_type
  type = _mime_type
  type ? type.to_s : 'text/plain'
end
53
+
54
# Internal: Does the extension's mime type say the blob is binary?
#
# Returns false when no mime type is known, otherwise the mime type's own
# binary? answer.
def binary_mime_type?
  type = _mime_type
  return false unless type

  type.binary?
end
60
+
61
# Internal: Is the blob binary according to its mime type, unless the
# languages database recognises the filename?
#
# A filename match from Language.find_by_filename overrides a binary
# mime type.
#
# Returns a truthy value when the blob is likely binary, falsy otherwise.
def likely_binary?
  binary_mime_type? && !Language.find_by_filename(name)
end
69
+
70
# Public: The Content-Type header value used when serving the raw blob.
#
# Binary blobs (by mime type or by content) report their mime type. All
# other blobs are served as text/plain, with a lowercased charset when an
# encoding was detected. The value is memoized.
#
# Examples
#
#   # => 'text/plain; charset=utf-8'
#   # => 'application/octet-stream'
#
# Returns a content type String.
def content_type
  @content_type ||=
    if binary_mime_type? || binary?
      mime_type
    elsif encoding
      "text/plain; charset=#{encoding.downcase}"
    else
      "text/plain"
    end
end
84
+
85
# Public: The Content-Disposition header value used when serving the raw
# blob.
#
# Text and image blobs are shown inline. Everything else is an attachment,
# carrying the URL-escaped basename as filename when the blob has a name.
#
# Examples
#
#   # => "attachment; filename=file.tar"
#   # => "inline"
#
# Returns a content disposition String.
def disposition
  return 'inline' if text? || image?
  return "attachment" if name.nil?

  "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
end
102
+
103
# Public: The encoding detected for the blob's data.
#
# Returns the :encoding entry of the detect_encoding result, or nil when
# detection produced no result.
def encoding
  detected = detect_encoding
  detected[:encoding] if detected
end
108
+
109
# Try to guess the encoding of the blob's data using CharlockHolmes.
#
# The detector result is memoized once it is truthy. Nothing is attempted
# (and nil is returned) while the blob has no data.
#
# Returns: a Hash, with :encoding, :confidence, :type
#          this will return nil if an error occurred during detection or
#          no valid encoding could be found
def detect_encoding
  return unless data

  @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data)
end
117
+
118
# Public: Is the blob binary?
#
# Decided from the blob's data and the result of encoding detection.
#
# Return true or false
def binary?
  # Large blobs aren't even loaded into memory
  return true if data.nil?

  # Treat blank files as text
  return false if data == ""

  # Charlock doesn't know what to think
  return true if encoding.nil?

  # Otherwise go with whatever Charlock reported
  detect_encoding[:type] == :binary
end
139
+
140
# Public: Is the blob text? This is the exact opposite of binary?.
#
# Return true or false
def text?
  return false if binary?

  true
end
146
+
147
# Public: Is the blob a supported image format?
#
# Supported extensions are .png, .jpg, .jpeg and .gif. The comparison is
# case-insensitive, so names such as "PHOTO.PNG" are recognised too.
#
# Return true or false
def image?
  ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
end
153
+
154
# Public: Is the blob a supported 3D model format?
#
# Supported extensions are .stl and .obj. The comparison is
# case-insensitive, so names such as "PART.STL" are recognised too.
#
# Return true or false
def solid?
  ['.stl', '.obj'].include?(extname.downcase)
end
160
+
161
# Number of bytes in one megabyte; the size limit checked by large?.
MEGABYTE = 1024 * 1024

# Public: Is the blob too big to load?
#
# A blob is large when its size, coerced with to_i, is strictly greater
# than MEGABYTE.
#
# Return true or false
def large?
  MEGABYTE < size.to_i
end
169
+
170
# Public: Is the blob safe to colorize?
#
# We use Pygments.rb for syntax highlighting blobs, which has some quirks
# and is essentially 'un-killable' via a normal timeout. To work around
# this we avoid handing Pygments.rb anything it can't handle: large blobs,
# non-text blobs and blobs with a high ratio of long lines.
#
# Return true or false
def safe_to_colorize?
  return false if large?
  return false unless text?

  !high_ratio_of_long_lines?
end
181
+
182
# Internal: Does the blob have a high ratio of long lines?
#
# Measured as size divided by the number of lines being greater than
# 5000. These types of files are usually going to make Pygments.rb angry
# if we try to colorize them.
#
# Return true or false (always false for a blob with no lines)
def high_ratio_of_long_lines?
  line_count = loc
  return false if line_count == 0

  size / line_count > 5000
end
192
+
193
# Public: Is the blob viewable?
#
# Only text blobs that are not large are viewable. Non-viewable blobs
# will just show a "View Raw" link.
#
# Return true or false
def viewable?
  return false if large?

  text?
end
201
+
202
# Pattern matching every vendored path convention. The individual patterns
# are read from the "vendor.yml" file located next to this source file and
# joined with '|' into a single Regexp.
vendor_yml = File.expand_path("../vendor.yml", __FILE__)
VendoredRegexp = Regexp.new(YAML.load_file(vendor_yml).join('|'))
204
+
205
# Public: Is the blob in a vendored directory?
#
# The blob's name is matched against VendoredRegexp. Vendored files are
# ignored by language statistics.
#
# See "vendor.yml" for a list of vendored conventions that match
# this pattern.
#
# Return true or false
def vendored?
  !!(name =~ VendoredRegexp)
end
216
+
217
# Public: Get each line of data
#
# Requires Blob#data
#
# The data is split on line_split_character, keeping trailing empty
# entries. Blobs that are not viewable, or that have no data, yield an
# empty Array. The result is memoized.
#
# Returns an Array of lines
def lines
  @lines ||= (viewable? && data) ? data.split(line_split_character, -1) : []
end
230
+
231
# Character used to split lines. This is "\r" when Mac Format is detected
# and "\n" in every other case. The choice is memoized.
#
# Returns a split pattern string.
def line_split_character
  @line_split_character ||=
    if mac_format?
      "\r"
    else
      "\n"
    end
end
238
+
239
# Public: Is the data in ** Mac Format **. This format uses \r (0x0d)
# characters for line ends and does not include a \n (0x0a).
#
# Only the first \r found within the first 4096 characters is inspected:
# the data is Mac Format when that \r is not immediately followed by \n.
#
# Returns true when mac format is detected, false when the first \r is
# followed by \n, and nil when the blob is not viewable or no \r was found.
def mac_format?
  return unless viewable?

  cr_index = data[0, 4096].index("\r")
  return unless cr_index

  data[cr_index + 1] != ?\n
end
249
+
250
# Public: Get number of lines of code
#
# Requires Blob#data
#
# Returns Integer, the number of entries in lines
def loc
  lines.length
end
258
+
259
# Public: Get number of source lines of code
#
# Counts the lines that contain at least one non-whitespace character.
#
# Requires Blob#data
#
# Returns Integer
def sloc
  lines.count { |line| line =~ /\S/ }
end
267
+
268
# Public: Is the blob a generated file?
#
# Generated source code is suppressed in diffs and is ignored by
# language statistics.
#
# The answer is computed once by Generated.generated? and cached in
# @_generated. The cache is checked with `defined?` so that a false answer
# is cached as well and the check is not repeated on every call.
#
# May load Blob#data
#
# Return true or false
def generated?
  return @_generated if defined? @_generated

  @_generated = Generated.generated?(name, lambda { data })
end
279
+
280
# Public: Should the blob be indexed for searching?
#
# Excluded:
# - Files over 100 * 1024 bytes
# - Non-text files
# - Files with no detected language (except '.txt' files)
# - Languages marked as not searchable
# - Generated source files
#
# Please add additional test coverage to
# `test/test_blob.rb#test_indexable` if you make any changes.
#
# Return true or false
def indexable?
  return false if size > 100 * 1024
  return false if binary?

  # Plain .txt files are indexed without looking at the language.
  return true if extname == '.txt'

  return false if language.nil?
  return false unless language.searchable?

  !generated?
end
309
+
310
# Public: Detects the Language of the blob.
#
# Language.detect is given the name, the mode and either the String held
# in @data (when there is one) or a lambda. The lambda yields "" for
# binary blobs and the blob's data otherwise. The result is memoized,
# including a nil result.
#
# May load Blob#data
#
# Returns a Language or nil if none is detected
def language
  unless defined? @language
    source =
      if defined?(@data) && @data.is_a?(String)
        @data
      else
        lambda { (binary_mime_type? || binary?) ? "" : data }
      end

    @language = Language.detect(name.to_s, source, mode)
  end

  @language
end
326
+
327
# Internal: Get the lexer of the blob.
#
# Uses the detected language's lexer, falling back to the Pygments
# 'Text only' lexer when no language is detected.
#
# Returns a Lexer.
def lexer
  detected = language
  return detected.lexer if detected

  Pygments::Lexer.find_by_name('Text only')
end
333
+
334
# Public: Highlight syntax of blob
#
# options - A Hash of options (defaults to {}). The Hash is updated in
#           place: options[:options] is created when missing and its
#           :encoding defaults to the blob's detected encoding.
#
# Returns html String, or nil when the blob is not safe to colorize
def colorize(options = {})
  return unless safe_to_colorize?

  lexer_options = (options[:options] ||= {})
  lexer_options[:encoding] ||= encoding
  lexer.highlight(data, options)
end
345
+
346
# Public: Highlight syntax of blob without the outer highlight div
# wrapper.
#
# options - A Hash of options (defaults to {})
#
# Returns the html String found inside the <div class="highlight"><pre>
# wrapper, '' when the blob could not be colorized, or nil when the
# colorized output does not contain the expected wrapper.
def colorize_without_wrapper(options = {})
  html = colorize(options)
  return '' unless html

  html[%r{<div class="highlight"><pre>(.*?)</pre>\s*</div>}m, 1]
end
359
+ end
360
+ end
@@ -0,0 +1,123 @@
1
+ require 'linguist/tokenizer'
2
+
3
module Linguist
  # Language bayesian classifier.
  #
  # Scores are computed in log space: the score of a language is the sum
  # of the natural logs of its per-token probabilities plus the natural
  # log of the language's prior.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object. Updated in place; any
    #            missing top-level keys are initialized.
    # language - String language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   Classifier.train!(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      db['tokens_total'] ||= 0
      db['languages_total'] ||= 0
      db['tokens'] ||= {}
      db['language_tokens'] ||= {}
      db['languages'] ||= {}

      tokens.each do |token|
        db['tokens'][language] ||= {}
        db['tokens'][language][token] ||= 0
        db['tokens'][language][token] += 1
        db['language_tokens'][language] ||= 0
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1
      end

      # The sample is counted even when it produced no tokens, so a
      # language may appear here without an entry in db['tokens'].
      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to
    #             (defaults to every language in the database).
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [['Ruby', -4.2], ['Python', -7.9], ...]
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score (higher is more likely).
    def self.classify(db, tokens, languages = nil)
      languages ||= db['languages'].keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier.
    #
    # db - Hash classifier database, as built by Classifier.train!.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens']
      @language_tokens = db['language_tokens']
      @languages       = db['languages']
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs, best score first. Each pair
    # contains the String language name and a Float score. Returns an
    # empty Array when tokens is nil.
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      scores = {}
      languages.each do |language|
        scores[language] = tokens_probability(tokens, language) +
                           language_probability(language)
      end

      # Sorting a Hash yields [language, score] pairs; highest score first.
      scores.sort { |a, b| b[1] <=> a[1] }
    end

    # Internal: Log probability of set of tokens in a language
    # occurring - log P(D | C)
    #
    # tokens   - Array of String tokens.
    # language - Language to check.
    #
    # Returns the Float sum of the natural logs of each token's
    # probability (0.0 for an empty token list).
    def tokens_probability(tokens, language)
      tokens.inject(0.0) do |sum, token|
        sum + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of token in language occurring - P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # A token never seen for the language gets 1 / total tokens trained.
    #
    # Returns Float.
    def token_probability(token, language)
      # A language trained only on token-less samples (or one that was
      # never trained) has no entry in @tokens; treat all of its tokens
      # as unseen rather than raising on nil.
      count = (@tokens[language] || {})[token].to_f

      if count == 0.0
        1 / @tokens_total.to_f
      else
        count / @language_tokens[language].to_f
      end
    end

    # Internal: Log probability of a language occurring - log P(C)
    #
    # language - Language to check.
    #
    # Returns the Float natural log of the language's share of the
    # trained samples.
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end
  end
end