gitlab-linguist 2.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1f61bbc6a1106207f7c4791dc3c4bcd83600fa59
4
+ data.tar.gz: 1433a3391e6247ba26603ba25a7028f3fea9a45f
5
+ SHA512:
6
+ metadata.gz: 9902468506da9cc6e5a8ddf684001c7e117a0285717aedb625cea00c7ba19f6689131fadd8a10e19cbe02bc666fd1e02bc49afe75b8e37fd8ce95184b43e6e61
7
+ data.tar.gz: 18eb029e57495598de5b8c9b8d9d630b9160d3c0b2c8d0db4ac212924d0f6952726ee9134a0288ae8a4fec4be529899fac88fa68d39c319ac19a60a3864ee720
@@ -0,0 +1,46 @@
1
#!/usr/bin/env ruby

# linguist — detect language type for a file, or, given a directory, determine language breakdown
#
# usage: linguist <path>

require 'linguist/file_blob'
require 'linguist/repository'

target = ARGV[0] || Dir.pwd

if File.directory?(target)
  # Directory mode: print each language's share of the repository,
  # largest first, as "NN%  Language".
  repo = Linguist::Repository.from_directory(target)
  total = repo.size.to_f
  repo.languages.sort_by { |_, bytes| bytes }.reverse.each do |language, bytes|
    share = ((bytes / total) * 100).round
    puts "%-4s %s" % ["#{share}%", language]
  end
elsif File.file?(target)
  # File mode: classify a single blob and report its stats.
  blob = Linguist::FileBlob.new(target, Dir.pwd)

  type =
    if blob.text?
      'Text'
    elsif blob.image?
      'Image'
    else
      'Binary'
    end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts " type: #{type}"
  puts " mime type: #{blob.mime_type}"
  puts " language: #{blob.language}"

  puts " blob is too large to be shown" if blob.large?
  puts " appears to be generated source code" if blob.generated?
  puts " appears to be a vendored file" if blob.vendored?
else
  abort "usage: linguist <path>"
end
@@ -0,0 +1,5 @@
1
+ require 'linguist/blob_helper'
2
+ require 'linguist/generated'
3
+ require 'linguist/language'
4
+ require 'linguist/repository'
5
+ require 'linguist/samples'
@@ -0,0 +1,316 @@
1
+ require 'linguist/generated'
2
+ require 'linguist/language'
3
+
4
+ require 'charlock_holmes'
5
+ require 'escape_utils'
6
+ require 'mime/types'
7
+ require 'pygments'
8
+ require 'yaml'
9
+
10
module Linguist
  # DEPRECATED Avoid mixing into Blob classes. Prefer functional interfaces
  # like `Language.detect` over `Blob#language`. Functions are much easier to
  # cache and compose.
  #
  # Avoid adding additional bloat to this module.
  #
  # BlobHelper is a mixin for Blobish classes that respond to "name",
  # "data" and "size" such as Grit::Blob.
  module BlobHelper
    # Public: Get the extname of the path
    #
    # Examples
    #
    #   blob(name='foo.rb').extname
    #   # => '.rb'
    #
    # Returns a String
    def extname
      File.extname(name.to_s)
    end

    # Internal: Lookup mime type for extension.
    #
    # Memoized with `defined?` (not `||=`) so a nil result — no known
    # mime type for the extension — is cached too.
    #
    # Returns a MIME::Type
    def _mime_type
      if defined? @_mime_type
        @_mime_type
      else
        guesses = ::MIME::Types.type_for(extname.to_s)

        # Prefer text mime types over binary
        @_mime_type = guesses.detect { |type| type.ascii? } ||
          # Otherwise use the first guess
          guesses.first
      end
    end

    # Public: Get the actual blob mime type
    #
    # Falls back to 'text/plain' when the extension is unknown.
    #
    # Examples
    #
    #   # => 'text/plain'
    #   # => 'text/html'
    #
    # Returns a mime type String.
    def mime_type
      _mime_type ? _mime_type.to_s : 'text/plain'
    end

    # Internal: Is the blob binary according to its mime type
    #
    # Return true or false
    def binary_mime_type?
      _mime_type ? _mime_type.binary? : false
    end

    # Internal: Is the blob binary according to its mime type,
    # overriding it if we have better data from the languages.yml
    # database.
    #
    # Return true or false
    def likely_binary?
      binary_mime_type? && !Language.find_by_filename(name)
    end

    # Public: Get the Content-Type header value
    #
    # This value is used when serving raw blobs.
    #
    # Examples
    #
    #   # => 'text/plain; charset=utf-8'
    #   # => 'application/octet-stream'
    #
    # Returns a content type String.
    def content_type
      @content_type ||= (binary_mime_type? || binary?) ? mime_type :
        (encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")
    end

    # Public: Get the Content-Disposition header value
    #
    # This value is used when serving raw blobs.
    #
    #   # => "attachment; filename=file.tar"
    #   # => "inline"
    #
    # Returns a content disposition String.
    def disposition
      if text? || image?
        'inline'
      elsif name.nil?
        "attachment"
      else
        "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
      end
    end

    # Public: Get the detected character encoding name (e.g. "UTF-8").
    #
    # Returns a String, or nil if detection failed or found nothing.
    def encoding
      if hash = detect_encoding
        hash[:encoding]
      end
    end

    # Try to guess the encoding
    #
    # Returns: a Hash, with :encoding, :confidence, :type
    #   this will return nil if an error occurred during detection or
    #   no valid encoding could be found
    def detect_encoding
      # Memoized; skipped entirely when data is nil (e.g. large blobs).
      @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
    end

    # Public: Is the blob binary?
    #
    # Return true or false
    def binary?
      # Large blobs aren't even loaded into memory
      if data.nil?
        true

      # Treat blank files as text
      elsif data == ""
        false

      # Charlock doesn't know what to think
      elsif encoding.nil?
        true

      # If Charlock says its binary
      else
        detect_encoding[:type] == :binary
      end
    end

    # Public: Is the blob text?
    #
    # Return true or false
    def text?
      !binary?
    end

    # Public: Is the blob a supported image format?
    #
    # Return true or false
    def image?
      ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
    end

    # Public: Is the blob a supported 3D model format?
    #
    # Return true or false
    def solid?
      extname.downcase == '.stl'
    end

    # Public: Is this blob a CSV file?
    #
    # Return true or false
    def csv?
      text? && extname.downcase == '.csv'
    end

    # Public: Is the blob a PDF?
    #
    # Return true or false
    def pdf?
      extname.downcase == '.pdf'
    end

    # Size threshold (in bytes) above which a blob is considered too
    # large to load or display.
    MEGABYTE = 1024 * 1024

    # Public: Is the blob too big to load?
    #
    # Return true or false
    def large?
      size.to_i > MEGABYTE
    end

    # Public: Is the blob safe to colorize?
    #
    # We use Pygments for syntax highlighting blobs. Pygments
    # can be too slow for very large blobs or for certain
    # corner-case blobs.
    #
    # Return true or false
    def safe_to_colorize?
      !large? && text? && !high_ratio_of_long_lines?
    end

    # Internal: Does the blob have a ratio of long lines?
    #
    # These types of files are usually going to make Pygments.rb
    # angry if we try to colorize them.
    #
    # Return true or false
    def high_ratio_of_long_lines?
      return false if loc == 0
      # Integer division: true when the average line exceeds 5000 bytes.
      size / loc > 5000
    end

    # Public: Is the blob viewable?
    #
    # Non-viewable blobs will just show a "View Raw" link
    #
    # Return true or false
    def viewable?
      !large? && text?
    end

    # Build one alternation regexp from the vendored-path patterns in
    # vendor.yml (shipped alongside this file).
    vendored_paths = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
    VendoredRegexp = Regexp.new(vendored_paths.join('|'))

    # Public: Is the blob in a vendored directory?
    #
    # Vendored files are ignored by language statistics.
    #
    # See "vendor.yml" for a list of vendored conventions that match
    # this pattern.
    #
    # Return true or false
    def vendored?
      name =~ VendoredRegexp ? true : false
    end

    # Public: Get each line of data
    #
    # Requires Blob#data
    #
    # Returns an Array of lines (empty for non-viewable blobs).
    def lines
      @lines ||=
        if viewable? && data
          # -1 limit keeps trailing empty fields, so a trailing newline
          # yields a final "" entry.
          data.split(/\r\n|\r|\n/, -1)
        else
          []
        end
    end

    # Public: Get number of lines of code
    #
    # Requires Blob#data
    #
    # Returns Integer
    def loc
      lines.size
    end

    # Public: Get number of source lines of code
    #
    # Counts only lines containing a non-whitespace character.
    #
    # Requires Blob#data
    #
    # Returns Integer
    def sloc
      lines.grep(/\S/).size
    end

    # Public: Is the blob a generated file?
    #
    # Generated source code is suppressed in diffs and is ignored by
    # language statistics.
    #
    # May load Blob#data
    #
    # NOTE(review): `||=` re-runs Generated.generated? on every call
    # while the result is false — harmless but not a true memoization.
    #
    # Return true or false
    def generated?
      @_generated ||= Generated.generated?(name, lambda { data })
    end

    # Public: Detects the Language of the blob.
    #
    # May load Blob#data
    #
    # NOTE(review): `mode` is not defined in this module — it must be
    # provided by the including class (e.g. Grit::Blob); confirm hosts
    # respond to it.
    #
    # Returns a Language or nil if none is detected
    def language
      return @language if defined? @language

      if defined?(@data) && @data.is_a?(String)
        # Data is already loaded; pass it through directly.
        data = @data
      else
        # Defer loading: Language.detect only calls this lambda if needed.
        data = lambda { (binary_mime_type? || binary?) ? "" : self.data }
      end

      @language = Language.detect(name.to_s, data, mode)
    end

    # Internal: Get the lexer of the blob.
    #
    # Returns a Lexer.
    def lexer
      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
    end

    # Public: Highlight syntax of blob
    #
    # options - A Hash of options (defaults to {})
    #
    # Returns html String, or nil if the blob is unsafe to colorize.
    def colorize(options = {})
      return unless safe_to_colorize?
      options[:options] ||= {}
      options[:options][:encoding] ||= encoding
      lexer.highlight(data, options)
    end
  end
end
@@ -0,0 +1,171 @@
1
+ require 'linguist/tokenizer'
2
+
3
module Linguist
  # Language bayesian classifier.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object
    # language - String language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   Classifier.train!(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    #
    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token,
    # per-language. See also dump_all_tokens, below.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      # Ensure every top-level counter/table exists before updating.
      db['tokens_total']    ||= 0
      db['languages_total'] ||= 0
      db['tokens']          ||= {}
      db['language_tokens'] ||= {}
      db['languages']       ||= {}

      tokens.each do |token|
        per_language = (db['tokens'][language] ||= {})
        per_language[token] = per_language.fetch(token, 0) + 1
        db['language_tokens'][language] = db['language_tokens'].fetch(language, 0) + 1
        db['tokens_total'] += 1
      end

      db['languages'][language] = db['languages'].fetch(language, 0) + 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # data      - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [['Ruby', 0.90], ['Python', 0.2], ...]
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def self.classify(db, tokens, languages = nil)
      candidates = languages || db['languages'].keys
      new(db).classify(tokens, candidates)
    end

    # Internal: Initialize a Classifier.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens']
      @language_tokens = db['language_tokens']
      @languages       = db['languages']
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      dump_all_tokens(tokens, languages) if verbosity >= 2

      scores = {}
      languages.each do |candidate|
        scores[candidate] =
          tokens_probability(tokens, candidate) + language_probability(candidate)
        if verbosity >= 1
          printf "%10s = %10.3f + %7.3f = %10.3f\n",
                 candidate, tokens_probability(tokens, candidate),
                 language_probability(candidate), scores[candidate]
        end
      end

      # Highest score first.
      scores.sort { |left, right| right[1] <=> left[1] }.map { |pair| [pair[0], pair[1]] }
    end

    # Internal: Probability of set of tokens in a language occurring - P(D | C)
    #
    # tokens   - Array of String tokens.
    # language - Language to check.
    #
    # Returns a Float (sum of log-probabilities).
    def tokens_probability(tokens, language)
      tokens.reduce(0.0) { |sum, token| sum + Math.log(token_probability(token, language)) }
    end

    # Internal: Probability of token in language occurring - P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # Returns Float between 0.0 and 1.0.
    def token_probability(token, language)
      count = @tokens[language][token].to_f
      if count == 0.0
        # Unseen token: smooth with a tiny non-zero probability so
        # Math.log never receives zero.
        1 / @tokens_total.to_f
      else
        count / @language_tokens[language].to_f
      end
    end

    # Internal: Probability of a language occurring - P(C)
    #
    # language - Language to check.
    #
    # Returns a Float (log-probability).
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end

    private

    # Internal: Debug verbosity, from LINGUIST_DEBUG (default 0).
    def verbosity
      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
    end

    # Internal: show a table of probabilities for each <token,language> pair.
    #
    # The number in each table entry is the number of "points" that each
    # token contributes toward the belief that the file under test is a
    # particular language. Points are additive.
    #
    # Points are the number of times a token appears in the file, times
    # how much more likely (log of probability ratio) that token is to
    # appear in one language vs. the least-likely language. Dashes
    # indicate the least-likely language (and zero points) for each token.
    def dump_all_tokens(tokens, languages)
      width = tokens.map { |tok| tok.size }.max

      printf "%#{width}s", ""
      puts " #" + languages.map { |lang| sprintf("%10s", lang) }.join

      occurrences = Hash.new(0)
      tokens.each { |tok| occurrences[tok] += 1 }

      occurrences.sort.each do |tok, count|
        row = languages.map { |lang| [lang, token_probability(tok, lang)] }
        lowest = row.map { |_, prob| prob }.min
        lowest_log = Math.log(lowest)
        # Skip tokens that score identically for every language.
        next if row.all? { |entry| entry[1] == row[0][1] }

        printf "%#{width}s%5d", tok, count

        puts row.map { |entry|
          entry[1] == lowest ? " -" : sprintf("%10.3f", count * (Math.log(entry[1]) - lowest_log))
        }.join
      end
    end
  end
end