tongue 0.2.10.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1a1363397afe6015c6036f01dfd10d6f5e225b9d
4
+ data.tar.gz: 330d083847d913947882a2afa55cb6bed4d8109d
5
+ SHA512:
6
+ metadata.gz: 8abfb4aab7feec239471bf8ba1731c1052f624dea59ec0d939b8910bc167b3b92ddb01df515e65ca7874f0ce1b9dca376cb715bfda492ab516a1385d3536bc94
7
+ data.tar.gz: d4a6dfa37d2568b6695e2ad6a90db066b8a772ccb66c0c534ec42912e7c88eb3efff331dc1b5c1008405bd9926ff647a9fa285fcb9b7fc0a37c3917647f5b24a
data/bin/tongue ADDED
@@ -0,0 +1,46 @@
1
#!/usr/bin/env ruby

# tongue — detect language type for a file, or, given a directory, determine language breakdown
# usage: tongue <path>

require 'linguist/file_blob'
require 'linguist/repository'

# Default to the current working directory when no path is given.
path = ARGV[0] || Dir.pwd

if File.directory?(path)
  # Directory: print each language's share of the repository, largest first.
  repo = Linguist::Repository.from_directory(path)
  repo.languages.sort_by { |_, size| size }.reverse.each do |language, size|
    # FIX: the original read `sprintf '%.2f' % percentage`, which formatted
    # the number with `%` and then passed the resulting String back through
    # sprintf as a format with no arguments. Format exactly once instead.
    percentage = format('%.2f', (size / repo.size.to_f) * 100)
    puts "%-7s %s" % ["#{percentage}%", language]
  end
elsif File.file?(path)
  # Single file: report line counts, content type and detected language.
  blob = Linguist::FileBlob.new(path, Dir.pwd)
  type =
    if blob.text?
      'Text'
    elsif blob.image?
      'Image'
    else
      'Binary'
    end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts " type: #{type}"
  puts " mime type: #{blob.mime_type}"
  puts " language: #{blob.language}"

  puts " blob is too large to be shown" if blob.large?
  puts " appears to be generated source code" if blob.generated?
  puts " appears to be a vendored file" if blob.vendored?
else
  abort "usage: tongue <path>"
end
data/lib/linguist.rb ADDED
@@ -0,0 +1,6 @@
1
+ require 'linguist/blob_helper'
2
+ require 'linguist/generated'
3
+ require 'linguist/heuristics'
4
+ require 'linguist/language'
5
+ require 'linguist/repository'
6
+ require 'linguist/samples'
@@ -0,0 +1,333 @@
1
+ require 'linguist/generated'
2
+ require 'linguist/language'
3
+
4
+ # require 'charlock_holmes'
5
+ # require 'escape_utils'
6
+ # require 'mime/types'
7
+ require 'pygments'
8
+ require 'yaml'
9
+
10
module Linguist
  # DEPRECATED Avoid mixing into Blob classes. Prefer functional interfaces
  # like `Language.detect` over `Blob#language`. Functions are much easier to
  # cache and compose.
  #
  # Avoid adding additional bloat to this module.
  #
  # BlobHelper is a mixin for Blob-ish classes that respond to "name",
  # "data" and "size" such as Grit::Blob.
  #
  # NOTE(review): in this vendored copy the charlock_holmes / escape_utils /
  # mime-types backed implementations were commented out and replaced with
  # hard-coded stubs. The stub return values are preserved below.
  module BlobHelper
    # Public: Get the extname of the path.
    #
    # Examples
    #
    #   blob(name='foo.rb').extname
    #   # => '.rb'
    #
    # Returns a String.
    def extname
      File.extname(name.to_s)
    end

    # Internal: Lookup mime type for extension.
    #
    # Stubbed: the MIME::Types lookup was removed; always plain text.
    #
    # Returns a mime type String.
    def _mime_type
      'text/plain'
    end

    # Public: Get the actual blob mime type.
    #
    # Returns a mime type String (stubbed to 'text/plain').
    def mime_type
      'text/plain'
    end

    # Internal: Is the blob binary according to its mime type?
    #
    # Stubbed: always false.
    def binary_mime_type?
      false
    end

    # Internal: Is the blob likely binary, overriding the mime type when the
    # languages.yml database knows better?
    #
    # Stubbed: always false.
    def likely_binary?
      false
    end

    # Public: Get the Content-Type header value used when serving raw blobs.
    #
    # Stubbed: always 'text/plain' (no charset negotiation).
    def content_type
      'text/plain'
    end

    # Public: Get the Content-Disposition header value used when serving
    # raw blobs: 'inline' for viewable content, 'attachment' otherwise.
    #
    # Returns a content disposition String.
    def disposition
      # The original branched on name.nil? but produced 'attachment' on
      # both of those paths, so the branches collapse.
      if text? || image?
        'inline'
      else
        'attachment'
      end
    end

    # Public: Blob encoding name.
    #
    # Stubbed: always 'UTF-8' (charlock_holmes detection removed).
    def encoding
      'UTF-8'
    end

    # Try to guess the encoding.
    #
    # Returns a Hash with :encoding, :confidence, :type (stubbed values).
    def detect_encoding
      { :encoding => 'UTF-8', :confidence => 100, :type => :text }
    end

    # Public: Is the blob binary?
    #
    # Large blobs aren't even loaded into memory, so nil data is treated
    # as binary; everything else is text in this stubbed copy.
    #
    # Return true or false
    def binary?
      data.nil?
    end

    # Public: Is the blob text? Stubbed: always true.
    def text?
      true
    end

    # Public: Is the blob a supported image format? Stubbed: always false.
    def image?
      false
    end

    # Public: Is the blob a supported 3D model format? Stubbed: always false.
    def solid?
      false
    end

    # Public: Is this blob a CSV file? Stubbed: always false.
    def csv?
      false
    end

    # Public: Is the blob a PDF? Stubbed: always false.
    def pdf?
      false
    end

    # Public: Is the blob too big to load? Stubbed: always false.
    def large?
      false
    end

    # Public: Is the blob safe to colorize?
    #
    # We use Pygments for syntax highlighting blobs, which can be too slow
    # for very large or corner-case blobs. Stubbed: always true.
    def safe_to_colorize?
      true
    end

    # Internal: Does the blob have a high ratio of long lines?
    # Stubbed: always false.
    def high_ratio_of_long_lines?
      false
    end

    # Public: Is the blob viewable? Non-viewable blobs just show a
    # "View Raw" link. Stubbed: always true.
    def viewable?
      true
    end

    # Public: Is the blob in a vendored directory? Vendored files are
    # ignored by language statistics. Stubbed: always false.
    def vendored?
      false
    end

    # Public: Get each line of data.
    #
    # Requires Blob#data.
    #
    # Returns an Array of lines (empty for nil, empty or unviewable data).
    def lines
      # FIX: the original guard read `!data == ''`, which parses as
      # `(!data) == ''` and is false for every non-nil data value, so every
      # blob reported zero lines. Compare against the empty string directly.
      @lines ||=
        if viewable? && data && data != ''
          # Split on any newline convention; limit -1 keeps trailing
          # empty lines, matching the original split call.
          data.split(/\r\n|\r|\n/, -1)
        else
          []
        end
    end

    # Public: Get number of lines of code.
    #
    # Requires Blob#data.
    #
    # Returns Integer.
    def loc
      lines.size
    end

    # Public: Get number of source lines of code (lines containing at
    # least one non-whitespace character).
    #
    # Requires Blob#data.
    #
    # Returns Integer.
    def sloc
      lines.grep(/\S/).size
    end

    # Public: Is the blob a generated file? Generated source code is
    # suppressed in diffs and ignored by language statistics.
    # Stubbed: always false.
    def generated?
      false
    end

    # Public: Detects the Language of the blob.
    #
    # May load Blob#data.
    #
    # Returns a Language or nil if none is detected.
    def language
      return @language if defined? @language

      # FIX: the original condition contained the same `!data == ''`
      # precedence bug as #lines, so the eagerly-loaded @data was never
      # passed through. Use the String directly when it is non-empty;
      # otherwise hand Language.detect a lazy loader.
      data =
        if defined?(@data) && @data.is_a?(String) && !@data.empty?
          @data
        else
          lambda { self.data }
        end

      # NOTE(review): `mode` is expected to be supplied by the including
      # class (e.g. Grit::Blob) — it is not defined in this module.
      @language = Language.detect(name.to_s, data, mode)
    end

    # Internal: Get the lexer of the blob.
    #
    # Returns a Pygments::Lexer.
    def lexer
      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
    end

    # Public: Highlight syntax of blob.
    #
    # options - A Hash of options (defaults to {})
    #
    # Returns html String, or nil when the blob is not safe to colorize.
    def colorize(options = {})
      return unless safe_to_colorize?
      options[:options] ||= {}
      options[:options][:encoding] ||= encoding
      lexer.highlight(data, options)
    end
  end
end
@@ -0,0 +1,171 @@
1
+ require 'linguist/tokenizer'
2
+
3
module Linguist
  # Language bayesian classifier.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object
    # language - String language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   Classifier.train!(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    #
    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token or
    # per-language. See also #debug_dump_all_tokens, below.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      db['tokens_total']    ||= 0
      db['languages_total'] ||= 0
      db['tokens']          ||= {}
      db['language_tokens'] ||= {}
      db['languages']       ||= {}

      tokens.each do |token|
        db['tokens'][language] ||= {}
        db['tokens'][language][token] ||= 0
        db['tokens'][language][token] += 1
        db['language_tokens'][language] ||= 0
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1
      end
      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to
    #             (defaults to every trained language).
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [ ['Ruby', -4.5], ['Python', -8.2], ... ]
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score (higher is more likely).
    def self.classify(db, tokens, languages = nil)
      languages ||= db['languages'].keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier from a database Hash.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens']
      @language_tokens = db['language_tokens']
      @languages       = db['languages']
    end

    # Internal: Guess language of data.
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
      scores = {}

      debug_dump_all_tokens(tokens, languages) if verbosity >= 2

      languages.each do |language|
        debug_dump_probabilities(tokens, language) if verbosity >= 1
        scores[language] = tokens_probability(tokens, language) + language_probability(language)
      end

      # Sort by score, highest first. Hash#sort already yields
      # [language, score] pairs, so no extra map is needed.
      scores.sort { |a, b| b[1] <=> a[1] }
    end

    # Internal: Log-probability of a set of tokens occurring in a
    # language — log P(D | C).
    #
    # tokens   - Array of String tokens.
    # language - Language to check.
    #
    # Returns a Float (sum of per-token log-probabilities, <= 0).
    def tokens_probability(tokens, language)
      tokens.inject(0.0) do |sum, token|
        sum + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of a token occurring in a language — P(F | C).
    # Unseen tokens are smoothed with 1 / total trained token count.
    #
    # token    - String token.
    # language - Language to check.
    #
    # Returns Float between 0.0 and 1.0.
    def token_probability(token, language)
      count = @tokens[language][token].to_f
      if count == 0.0
        1 / @tokens_total.to_f
      else
        count / @language_tokens[language].to_f
      end
    end

    # Internal: Log-probability of a language occurring — log P(C).
    #
    # language - Language to check.
    #
    # Returns a Float (<= 0).
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end

    private

    # Internal: debug verbosity, read once from the LINGUIST_DEBUG env var.
    def verbosity
      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
    end

    # Internal: print the score breakdown for one language.
    #
    # FIX: the original referenced `scores[language]`, a local variable of
    # #classify that is not in scope here — any run with LINGUIST_DEBUG=1
    # raised NameError. Compute the total from its two components instead.
    def debug_dump_probabilities(tokens, language)
      tp = tokens_probability(tokens, language)
      lp = language_probability(language)
      printf("%10s = %10.3f + %7.3f = %10.3f\n", language, tp, lp, tp + lp)
    end

    # Internal: show a table of probabilities for each <token,language> pair.
    #
    # The number in each table entry is the number of "points" that each
    # token contributes toward the belief that the file under test is a
    # particular language. Points are additive.
    #
    # Points are the number of times a token appears in the file, times
    # how much more likely (log of probability ratio) that token is to
    # appear in one language vs. the least-likely language. Dashes
    # indicate the least-likely language (and zero points) for each token.
    def debug_dump_all_tokens(tokens, languages)
      maxlen = tokens.map { |tok| tok.size }.max

      printf "%#{maxlen}s", ""
      puts " #" + languages.map { |lang| sprintf("%10s", lang) }.join

      token_map = Hash.new(0)
      tokens.each { |tok| token_map[tok] += 1 }

      token_map.sort.each do |tok, count|
        arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
        min = arr.map { |_, prob| prob }.min
        minlog = Math.log(min)
        # Skip tokens that are equally likely in every language — they
        # contribute nothing to discrimination.
        next if arr.inject(true) { |result, pair| result && pair[1] == arr[0][1] }

        printf "%#{maxlen}s%5d", tok, count
        puts arr.map { |ent|
          # Dash (right-aligned to match the %10.3f columns) marks the
          # least-likely language for this token.
          ent[1] == min ? sprintf("%10s", "-") : sprintf("%10.3f", count * (Math.log(ent[1]) - minlog))
        }.join
      end
    end
  end
end