gitlab-linguist 2.9.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1f61bbc6a1106207f7c4791dc3c4bcd83600fa59
4
+ data.tar.gz: 1433a3391e6247ba26603ba25a7028f3fea9a45f
5
+ SHA512:
6
+ metadata.gz: 9902468506da9cc6e5a8ddf684001c7e117a0285717aedb625cea00c7ba19f6689131fadd8a10e19cbe02bc666fd1e02bc49afe75b8e37fd8ce95184b43e6e61
7
+ data.tar.gz: 18eb029e57495598de5b8c9b8d9d630b9160d3c0b2c8d0db4ac212924d0f6952726ee9134a0288ae8a4fec4be529899fac88fa68d39c319ac19a60a3864ee720
@@ -0,0 +1,46 @@
1
#!/usr/bin/env ruby

# linguist — detect language type for a file, or, given a directory, determine language breakdown
#
# usage: linguist <path>

require 'linguist/file_blob'
require 'linguist/repository'

# Default to the current working directory when no path argument is given.
target = ARGV[0] || Dir.pwd

if File.directory?(target)
  # Directory mode: print each language's share of the repository,
  # largest share first.
  repository = Linguist::Repository.from_directory(target)
  repository.languages.sort_by { |_, bytes| -bytes }.each do |language, bytes|
    pct = ((bytes / repository.size.to_f) * 100).round
    puts "%-4s %s" % ["#{pct}%", language]
  end
elsif File.file?(target)
  # File mode: report line counts, detected type, mime type and language.
  blob = Linguist::FileBlob.new(target, Dir.pwd)

  kind =
    if blob.text?
      'Text'
    elsif blob.image?
      'Image'
    else
      'Binary'
    end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts " type: #{kind}"
  puts " mime type: #{blob.mime_type}"
  puts " language: #{blob.language}"

  puts " blob is too large to be shown" if blob.large?
  puts " appears to be generated source code" if blob.generated?
  puts " appears to be a vendored file" if blob.vendored?
else
  abort "usage: linguist <path>"
end
@@ -0,0 +1,5 @@
1
+ require 'linguist/blob_helper'
2
+ require 'linguist/generated'
3
+ require 'linguist/language'
4
+ require 'linguist/repository'
5
+ require 'linguist/samples'
@@ -0,0 +1,316 @@
1
+ require 'linguist/generated'
2
+ require 'linguist/language'
3
+
4
+ require 'charlock_holmes'
5
+ require 'escape_utils'
6
+ require 'mime/types'
7
+ require 'pygments'
8
+ require 'yaml'
9
+
10
module Linguist
  # DEPRECATED Avoid mixing into Blob classes. Prefer functional interfaces
  # like `Language.detect` over `Blob#language`. Functions are much easier to
  # cache and compose.
  #
  # Avoid adding additional bloat to this module.
  #
  # BlobHelper is a mixin for Blobish classes that respond to "name",
  # "data" and "size" such as Grit::Blob.
  module BlobHelper
    # Public: File extension of the blob's path, including the leading dot.
    #
    # Examples
    #
    #   blob(name='foo.rb').extname
    #   # => '.rb'
    #
    # Returns a String.
    def extname
      File.extname(name.to_s)
    end

    # Internal: Look up the MIME type for the blob's extension.
    #
    # Returns a MIME::Type, or nil when the extension is unknown.
    def _mime_type
      return @_mime_type if defined? @_mime_type

      candidates = ::MIME::Types.type_for(extname.to_s)

      # Text mime types win over binary ones; otherwise take the first guess.
      @_mime_type = candidates.detect { |type| type.ascii? } || candidates.first
    end

    # Public: The blob's mime type, e.g. 'text/plain' or 'text/html'.
    # Falls back to 'text/plain' when the extension is unknown.
    #
    # Returns a mime type String.
    def mime_type
      guess = _mime_type
      guess ? guess.to_s : 'text/plain'
    end

    # Internal: Is the blob binary according to its mime type?
    #
    # Return true or false
    def binary_mime_type?
      _mime_type.nil? ? false : _mime_type.binary?
    end

    # Internal: Binary per the mime type, unless the languages.yml database
    # recognizes the filename and overrides the guess.
    #
    # Return true or false
    def likely_binary?
      binary_mime_type? && !Language.find_by_filename(name)
    end

    # Public: Get the Content-Type header value.
    #
    # Used when serving raw blobs.
    #
    # Examples
    #
    #   # => 'text/plain; charset=utf-8'
    #   # => 'application/octet-stream'
    #
    # Returns a content type String.
    def content_type
      @content_type ||=
        if binary_mime_type? || binary?
          mime_type
        elsif encoding
          "text/plain; charset=#{encoding.downcase}"
        else
          "text/plain"
        end
    end

    # Public: Get the Content-Disposition header value.
    #
    # Used when serving raw blobs.
    #
    #   # => "attachment; filename=file.tar"
    #   # => "inline"
    #
    # Returns a content disposition String.
    def disposition
      if text? || image?
        'inline'
      elsif name.nil?
        "attachment"
      else
        "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
      end
    end

    # Public: The detected character encoding name, or nil when unknown.
    def encoding
      detected = detect_encoding
      detected[:encoding] if detected
    end

    # Try to guess the encoding
    #
    # Returns: a Hash, with :encoding, :confidence, :type
    # this will return nil if an error occurred during detection or
    # no valid encoding could be found
    def detect_encoding
      @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
    end

    # Public: Is the blob binary?
    #
    # Return true or false
    def binary?
      # Large blobs aren't even loaded into memory, so nil data means binary.
      return true if data.nil?

      # Treat blank files as text.
      return false if data == ""

      # Charlock couldn't make sense of it.
      return true if encoding.nil?

      # Trust Charlock's verdict.
      detect_encoding[:type] == :binary
    end

    # Public: Is the blob text?
    #
    # Return true or false
    def text?
      !binary?
    end

    # Public: Is the blob a supported image format?
    #
    # Return true or false
    def image?
      %w[.png .jpg .jpeg .gif].include?(extname.downcase)
    end

    # Public: Is the blob a supported 3D model format?
    #
    # Return true or false
    def solid?
      '.stl' == extname.downcase
    end

    # Public: Is this blob a CSV file?
    #
    # Return true or false
    def csv?
      text? && '.csv' == extname.downcase
    end

    # Public: Is the blob a PDF?
    #
    # Return true or false
    def pdf?
      '.pdf' == extname.downcase
    end

    MEGABYTE = 1024 * 1024

    # Public: Is the blob too big to load?
    #
    # Return true or false
    def large?
      size.to_i > MEGABYTE
    end

    # Public: Is the blob safe to colorize?
    #
    # We use Pygments for syntax highlighting blobs. Pygments
    # can be too slow for very large blobs or for certain
    # corner-case blobs.
    #
    # Return true or false
    def safe_to_colorize?
      !large? && text? && !high_ratio_of_long_lines?
    end

    # Internal: Does the blob have a high ratio of long lines?
    #
    # These types of files are usually going to make Pygments.rb
    # angry if we try to colorize them.
    #
    # Return true or false
    def high_ratio_of_long_lines?
      count = loc
      return false if count == 0
      size / count > 5000
    end

    # Public: Is the blob viewable?
    #
    # Non-viewable blobs will just show a "View Raw" link.
    #
    # Return true or false
    def viewable?
      !large? && text?
    end

    patterns = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
    VendoredRegexp = Regexp.new(patterns.join('|'))

    # Public: Is the blob in a vendored directory?
    #
    # Vendored files are ignored by language statistics.
    #
    # See "vendor.yml" for a list of vendored conventions that match
    # this pattern.
    #
    # Return true or false
    def vendored?
      name =~ VendoredRegexp ? true : false
    end

    # Public: Get each line of data.
    #
    # Requires Blob#data
    #
    # Returns an Array of lines
    def lines
      @lines ||=
        if viewable? && data
          # -1 keeps trailing empty fields, so a trailing newline
          # produces a final empty line.
          data.split(/\r\n|\r|\n/, -1)
        else
          []
        end
    end

    # Public: Get number of lines of code.
    #
    # Requires Blob#data
    #
    # Returns Integer
    def loc
      lines.size
    end

    # Public: Get number of source lines of code (non-blank lines).
    #
    # Requires Blob#data
    #
    # Returns Integer
    def sloc
      lines.count { |line| line =~ /\S/ }
    end

    # Public: Is the blob a generated file?
    #
    # Generated source code is suppressed in diffs and is ignored by
    # language statistics.
    #
    # May load Blob#data
    #
    # Return true or false
    def generated?
      @_generated ||= Generated.generated?(name, -> { data })
    end

    # Public: Detects the Language of the blob.
    #
    # May load Blob#data
    #
    # Returns a Language or nil if none is detected
    def language
      return @language if defined? @language

      source =
        if defined?(@data) && @data.is_a?(String)
          # Data already in memory — hand it straight to the detector.
          @data
        else
          # Defer loading; binary blobs are detected as empty text.
          lambda { (binary_mime_type? || binary?) ? "" : self.data }
        end

      # NOTE(review): `mode` is assumed to be provided by the including
      # blob class (e.g. Grit::Blob) — confirm against callers.
      @language = Language.detect(name.to_s, source, mode)
    end

    # Internal: Get the lexer of the blob.
    #
    # Returns a Lexer.
    def lexer
      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
    end

    # Public: Highlight syntax of blob.
    #
    # options - A Hash of options (defaults to {})
    #
    # Returns html String
    def colorize(options = {})
      return unless safe_to_colorize?
      options[:options] ||= {}
      options[:options][:encoding] ||= encoding
      lexer.highlight(data, options)
    end
  end
end
@@ -0,0 +1,171 @@
1
+ require 'linguist/tokenizer'
2
+
3
module Linguist
  # Language bayesian classifier.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db - Hash classifier database object
    # language - String language of data
    # data - String contents of file
    #
    # Examples
    #
    #   Classifier.train(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    #
    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token,
    # per-language. See also dump_all_tokens, below.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      # Make sure every top-level counter/table exists before updating.
      db['tokens_total'] ||= 0
      db['languages_total'] ||= 0
      db['tokens'] ||= {}
      db['language_tokens'] ||= {}
      db['languages'] ||= {}

      tokens.each do |token|
        per_language = (db['tokens'][language] ||= {})
        per_language[token] = per_language.fetch(token, 0) + 1
        db['language_tokens'][language] = db['language_tokens'].fetch(language, 0) + 1
        db['tokens_total'] += 1
      end

      db['languages'][language] = db['languages'].fetch(language, 0) + 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db - Hash of classifier tokens database.
    # tokens - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [ 'Ruby', 0.90], ['Python', 0.2], ... ]
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def self.classify(db, tokens, languages = nil)
      new(db).classify(tokens, languages || db['languages'].keys)
    end

    # Internal: Initialize a Classifier from a database Hash.
    def initialize(db = {})
      @tokens_total = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens = db['tokens']
      @language_tokens = db['language_tokens']
      @languages = db['languages']
    end

    # Internal: Guess language of data.
    #
    # tokens - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      dump_all_tokens(tokens, languages) if verbosity >= 2

      scores = languages.each_with_object({}) do |language, acc|
        acc[language] = tokens_probability(tokens, language) +
                        language_probability(language)
        if verbosity >= 1
          printf "%10s = %10.3f + %7.3f = %10.3f\n",
            language, tokens_probability(tokens, language), language_probability(language), acc[language]
        end
      end

      # Best score first.
      scores.sort { |a, b| b[1] <=> a[1] }
    end

    # Internal: Probability of a set of tokens in a language occurring - P(D | C)
    #
    # tokens - Array of String tokens.
    # language - Language to check.
    #
    # Returns the summed log-probability (a Float).
    def tokens_probability(tokens, language)
      tokens.reduce(0.0) do |total, token|
        total + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of a token in a language occurring - P(F | C)
    #
    # token - String token.
    # language - Language to check.
    #
    # Returns Float between 0.0 and 1.0.
    def token_probability(token, language)
      seen = @tokens[language][token].to_f
      if seen == 0.0
        # Unseen token: smooth with one count over the global total.
        1 / @tokens_total.to_f
      else
        seen / @language_tokens[language].to_f
      end
    end

    # Internal: Probability of a language occurring - P(C)
    #
    # language - Language to check.
    #
    # Returns the log-probability (a Float).
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end

    private

    # Internal: Debug verbosity taken from the LINGUIST_DEBUG env var.
    def verbosity
      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
    end

    # Internal: show a table of probabilities for each <token,language> pair.
    #
    # The number in each table entry is the number of "points" that each
    # token contributes toward the belief that the file under test is a
    # particular language. Points are additive.
    #
    # Points are the number of times a token appears in the file, times
    # how much more likely (log of probability ratio) that token is to
    # appear in one language vs. the least-likely language. Dashes
    # indicate the least-likely language (and zero points) for each token.
    def dump_all_tokens(tokens, languages)
      width = tokens.map { |tok| tok.size }.max

      printf "%#{width}s", ""
      puts " #" + languages.map { |lang| sprintf("%10s", lang) }.join

      counts = Hash.new(0)
      tokens.each { |tok| counts[tok] += 1 }

      counts.sort.each do |tok, count|
        pairs = languages.map { |lang| [lang, token_probability(tok, lang)] }
        lowest = pairs.map { |_, prob| prob }.min
        lowest_log = Math.log(lowest)

        # Skip tokens that are equally likely in every language —
        # they contribute no points anywhere.
        next if pairs.all? { |_, prob| prob == pairs[0][1] }

        printf "%#{width}s%5d", tok, count

        puts pairs.map { |_, prob|
          prob == lowest ? " -" : sprintf("%10.3f", count * (Math.log(prob) - lowest_log))
        }.join
      end
    end
  end
end