tongue 0.2.10.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1a1363397afe6015c6036f01dfd10d6f5e225b9d
4
+ data.tar.gz: 330d083847d913947882a2afa55cb6bed4d8109d
5
+ SHA512:
6
+ metadata.gz: 8abfb4aab7feec239471bf8ba1731c1052f624dea59ec0d939b8910bc167b3b92ddb01df515e65ca7874f0ce1b9dca376cb715bfda492ab516a1385d3536bc94
7
+ data.tar.gz: d4a6dfa37d2568b6695e2ad6a90db066b8a772ccb66c0c534ec42912e7c88eb3efff331dc1b5c1008405bd9926ff647a9fa285fcb9b7fc0a37c3917647f5b24a
data/bin/tongue ADDED
@@ -0,0 +1,46 @@
1
#!/usr/bin/env ruby

# tongue — detect language type for a file, or, given a directory, determine language breakdown
# usage: tongue <path>

require 'linguist/file_blob'
require 'linguist/repository'

# Default to the current working directory when no path argument is given.
path = ARGV[0] || Dir.pwd

if File.directory?(path)
  repo = Linguist::Repository.from_directory(path)
  # Largest language first.
  repo.languages.sort_by { |_, size| size }.reverse.each do |language, size|
    # Percentage of the repository's bytes attributed to this language.
    # (The original did `sprintf '%.2f' % percentage`, which formats the
    # number and then feeds the RESULT back to sprintf as a format string —
    # redundant and fragile. Use sprintf with an explicit argument.)
    percentage = sprintf('%.2f', (size / repo.size.to_f) * 100)
    puts "%-7s %s" % ["#{percentage}%", language]
  end
elsif File.file?(path)
  blob = Linguist::FileBlob.new(path, Dir.pwd)
  type = if blob.text?
    'Text'
  elsif blob.image?
    'Image'
  else
    'Binary'
  end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts "  type:      #{type}"
  puts "  mime type: #{blob.mime_type}"
  puts "  language:  #{blob.language}"

  if blob.large?
    puts "  blob is too large to be shown"
  end

  if blob.generated?
    puts "  appears to be generated source code"
  end

  if blob.vendored?
    puts "  appears to be a vendored file"
  end
else
  abort "usage: tongue <path>"
end
data/lib/linguist.rb ADDED
@@ -0,0 +1,6 @@
1
+ require 'linguist/blob_helper'
2
+ require 'linguist/generated'
3
+ require 'linguist/heuristics'
4
+ require 'linguist/language'
5
+ require 'linguist/repository'
6
+ require 'linguist/samples'
@@ -0,0 +1,333 @@
1
+ require 'linguist/generated'
2
+ require 'linguist/language'
3
+
4
+ # require 'charlock_holmes'
5
+ # require 'escape_utils'
6
+ # require 'mime/types'
7
+ require 'pygments'
8
+ require 'yaml'
9
+
10
module Linguist
  # DEPRECATED Avoid mixing into Blob classes. Prefer functional interfaces
  # like `Language.detect` over `Blob#language`. Functions are much easier to
  # cache and compose.
  #
  # Avoid adding additional bloat to this module.
  #
  # BlobHelper is a mixin for Blobish classes that respond to "name",
  # "data" and "size" such as Grit::Blob.
  #
  # NOTE(review): many predicates below are hard-coded stubs. The original
  # implementations (kept as trailing comments) relied on charlock_holmes,
  # escape_utils and mime-types, which this stripped-down gem does not ship.
  module BlobHelper
    # Public: Get the extname of the path
    #
    # Examples
    #
    #   blob(name='foo.rb').extname
    #   # => '.rb'
    #
    # Returns a String
    def extname
      File.extname(name.to_s)
    end

    # Internal: Lookup mime type for extension.
    #
    # Stubbed: always reports plain text (mime-types dependency removed).
    #
    # Returns a mime type String.
    def _mime_type
      'text/plain'
      # if defined? @_mime_type
      #   @_mime_type
      # else
      #   guesses = ::MIME::Types.type_for(extname.to_s)
      #
      #   # Prefer text mime types over binary
      #   @_mime_type = guesses.detect { |type| type.ascii? } ||
      #     # Otherwise use the first guess
      #     guesses.first
      # end
    end

    # Public: Get the actual blob mime type
    #
    # Examples
    #
    #   # => 'text/plain'
    #   # => 'text/html'
    #
    # Returns a mime type String.
    def mime_type
      'text/plain'
    end

    # Internal: Is the blob binary according to its mime type
    #
    # Return true or false
    def binary_mime_type?
      false
      # _mime_type ? _mime_type.binary? : false
    end

    # Internal: Is the blob binary according to its mime type,
    # overriding it if we have better data from the languages.yml
    # database.
    #
    # Return true or false
    def likely_binary?
      false
      # binary_mime_type? && !Language.find_by_filename(name)
    end

    # Public: Get the Content-Type header value
    #
    # This value is used when serving raw blobs.
    #
    # Examples
    #
    #   # => 'text/plain; charset=utf-8'
    #   # => 'application/octet-stream'
    #
    # Returns a content type String.
    def content_type
      "text/plain"
      # @content_type ||= (binary_mime_type? || binary?) ? mime_type :
      #   (encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")
    end

    # Public: Get the Content-Disposition header value
    #
    # This value is used when serving raw blobs.
    #
    #   # => "attachment; filename=file.tar"
    #   # => "inline"
    #
    # Returns a content disposition String.
    def disposition
      if text? || image?
        'inline'
      elsif name.nil?
        "attachment"
      else
        'attachment'
        # "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
      end
    end

    # Public: Get the assumed encoding of the blob.
    #
    # Stubbed: charlock_holmes was removed, so UTF-8 is always assumed.
    #
    # Returns an encoding name String.
    def encoding
      # if hash = detect_encoding
      'UTF-8'
      # end
    end

    # Try to guess the encoding
    #
    # Returns: a Hash, with :encoding, :confidence, :type
    # this will return nil if an error occurred during detection or
    # no valid encoding could be found
    #
    # Stubbed: always claims confident UTF-8 text.
    def detect_encoding
      {:encoding => 'UTF-8', :confidence => 100, :type => :text}
    end

    # Public: Is the blob binary?
    #
    # Only nil data (large blobs aren't even loaded into memory) is treated
    # as binary; the charlock_holmes-based detection is stubbed out.
    #
    # Return true or false
    def binary?
      data.nil?
      # Treat blank files as text
      # elsif data == ""
      #   false
      # Charlock doesn't know what to think
      # elsif encoding.nil?
      #   true
      # If Charlock says its binary
      # else
      #   detect_encoding[:type] == :binary
    end

    # Public: Is the blob text?
    #
    # Return true or false
    def text?
      true
    end

    # Public: Is the blob a supported image format?
    #
    # Return true or false
    def image?
      false
      # ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
    end

    # Public: Is the blob a supported 3D model format?
    #
    # Return true or false
    def solid?
      false
      # extname.downcase == '.stl'
    end

    # Public: Is this blob a CSV file?
    #
    # Return true or false
    def csv?
      false
      # text? && extname.downcase == '.csv'
    end

    # Public: Is the blob a PDF?
    #
    # Return true or false
    def pdf?
      false
      # extname.downcase == '.pdf'
    end

    # MEGABYTE = 1024 * 1024

    # Public: Is the blob too big to load?
    #
    # Return true or false
    def large?
      false
      # size.to_i > MEGABYTE
    end

    # Public: Is the blob safe to colorize?
    #
    # We use Pygments for syntax highlighting blobs. Pygments
    # can be too slow for very large blobs or for certain
    # corner-case blobs.
    #
    # Return true or false
    def safe_to_colorize?
      true
      # !large? && text? && !high_ratio_of_long_lines?
    end

    # Internal: Does the blob have a ratio of long lines?
    #
    # These types of files are usually going to make Pygments.rb
    # angry if we try to colorize them.
    #
    # Return true or false
    def high_ratio_of_long_lines?
      false
      # return false if loc == 0
      # size / loc > 5000
    end

    # Public: Is the blob viewable?
    #
    # Non-viewable blobs will just show a "View Raw" link
    #
    # Return true or false
    def viewable?
      true
      # !large? && text?
    end

    # vendored_paths = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
    # VendoredRegexp = Regexp.new(vendored_paths.join('|'))

    # Public: Is the blob in a vendored directory?
    #
    # Vendored files are ignored by language statistics.
    #
    # See "vendor.yml" for a list of vendored conventions that match
    # this pattern.
    #
    # Return true or false
    def vendored?
      false
      # name =~ VendoredRegexp ? true : false
    end

    # Public: Get each line of data
    #
    # Requires Blob#data
    #
    # BUGFIX: the previous guard was `!data == ''`, which Ruby parses as
    # `(!data) == ''` — always false — so this method ALWAYS returned [],
    # making #loc and #sloc report 0 for every file.
    #
    # Returns an Array of lines
    def lines
      @lines ||=
        if viewable? && !data.nil? && data != ''
          data.split(/\r\n|\r|\n/, -1)
        else
          []
        end
    end

    # Public: Get number of lines of code
    #
    # Requires Blob#data
    #
    # Returns Integer
    def loc
      lines.size
    end

    # Public: Get number of source lines of code
    #
    # Requires Blob#data
    #
    # Returns Integer (lines containing at least one non-whitespace char)
    def sloc
      lines.grep(/\S/).size
    end

    # Public: Is the blob a generated file?
    #
    # Generated source code is suppressed in diffs and is ignored by
    # language statistics.
    #
    # May load Blob#data
    #
    # Return true or false
    def generated?
      false
      # @_generated ||= Generated.generated?(name, lambda { data })
    end

    # Public: Detects the Language of the blob.
    #
    # May load Blob#data
    #
    # Returns a Language or nil if none is detected
    def language
      return @language if defined? @language

      # Reuse already-loaded data when available; otherwise hand
      # Language.detect a lambda so loading stays lazy.
      # BUGFIX: the previous guard contained `!data == ''` (always false),
      # so the eager branch was unreachable.
      if defined?(@data) && @data.is_a?(String) && !@data.empty?
        data = @data
      else
        data = lambda { self.data }
      end

      # NOTE(review): `mode` is expected to be provided by the including
      # blob class (BlobHelper itself does not define it) — confirm callers.
      @language = Language.detect(name.to_s, data, mode)
    end

    # Internal: Get the lexer of the blob.
    #
    # Returns a Lexer.
    def lexer
      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
    end

    # Public: Highlight syntax of blob
    #
    # options - A Hash of options (defaults to {})
    #
    # Returns html String
    def colorize(options = {})
      return unless safe_to_colorize?
      options[:options] ||= {}
      options[:options][:encoding] ||= encoding
      lexer.highlight(data, options)
    end
  end
end
@@ -0,0 +1,171 @@
1
+ require 'linguist/tokenizer'
2
+
3
module Linguist
  # Language bayesian classifier.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object
    # language - String language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   Classifier.train!(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    #
    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token or
    # per-language. See also #debug_dump_all_tokens, below.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      # Lazily initialize the database buckets so an empty Hash works.
      db['tokens_total']    ||= 0
      db['languages_total'] ||= 0
      db['tokens']          ||= {}
      db['language_tokens'] ||= {}
      db['languages']       ||= {}

      tokens.each do |token|
        db['tokens'][language] ||= {}
        db['tokens'][language][token] ||= 0
        db['tokens'][language][token] += 1
        db['language_tokens'][language] ||= 0
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1
      end
      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [ ['Ruby', 0.90], ['Python', 0.2], ... ]
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def self.classify(db, tokens, languages = nil)
      languages ||= db['languages'].keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens']
      @language_tokens = db['language_tokens']
      @languages       = db['languages']
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score (highest first).
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
      scores = {}

      debug_dump_all_tokens(tokens, languages) if verbosity >= 2

      languages.each do |language|
        debug_dump_probabilities(tokens, language) if verbosity >= 1
        # Naive Bayes in log space: log P(D|C) + log P(C).
        scores[language] = tokens_probability(tokens, language) + language_probability(language)
      end

      scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
    end

    # Internal: Probability of set of tokens in a language occurring - P(D | C)
    #
    # tokens   - Array of String tokens.
    # language - Language to check.
    #
    # Returns a Float (sum of log-probabilities, so <= 0.0).
    def tokens_probability(tokens, language)
      tokens.inject(0.0) do |sum, token|
        sum + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of token in language occurring - P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # Returns Float between 0.0 and 1.0.
    def token_probability(token, language)
      if @tokens[language][token].to_f == 0.0
        # Unseen token: fall back to a tiny uniform probability so the
        # log never blows up on -Infinity.
        1 / @tokens_total.to_f
      else
        @tokens[language][token].to_f / @language_tokens[language].to_f
      end
    end

    # Internal: Probability of a language occurring - P(C)
    #
    # language - Language to check.
    #
    # Returns Float (log-probability, <= 0.0).
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end

    private
    # Internal: Debug verbosity, from the LINGUIST_DEBUG env var (0 if unset).
    def verbosity
      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
    end

    # Internal: Print the score breakdown for one language.
    #
    # BUGFIX: the original referenced an undefined local `scores` here,
    # raising NameError whenever LINGUIST_DEBUG >= 1. Recompute the two
    # terms locally instead.
    def debug_dump_probabilities(tokens, language)
      tp = tokens_probability(tokens, language)
      lp = language_probability(language)
      printf("%10s = %10.3f + %7.3f = %10.3f\n", language, tp, lp, tp + lp)
    end

    # Internal: show a table of probabilities for each <token,language> pair.
    #
    # The number in each table entry is the number of "points" that each
    # token contributes toward the belief that the file under test is a
    # particular language. Points are additive.
    #
    # Points are the number of times a token appears in the file, times
    # how much more likely (log of probability ratio) that token is to
    # appear in one language vs. the least-likely language. Dashes
    # indicate the least-likely language (and zero points) for each token.
    def debug_dump_all_tokens(tokens, languages)
      maxlen = tokens.map { |tok| tok.size }.max

      printf "%#{maxlen}s", ""
      puts "    #" + languages.map { |lang| sprintf("%10s", lang) }.join

      token_map = Hash.new(0)
      tokens.each { |tok| token_map[tok] += 1 }

      token_map.sort.each { |tok, count|
        arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
        min = arr.map { |a, b| b }.min
        minlog = Math.log(min)
        if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] }
          printf "%#{maxlen}s%5d", tok, count

          puts arr.map { |ent|
            ent[1] == min ? "         -" : sprintf("%10.3f", count * (Math.log(ent[1]) - minlog))
          }.join
        end
      }
    end
  end
end