tongue 0.2.10.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1a1363397afe6015c6036f01dfd10d6f5e225b9d
4
+ data.tar.gz: 330d083847d913947882a2afa55cb6bed4d8109d
5
+ SHA512:
6
+ metadata.gz: 8abfb4aab7feec239471bf8ba1731c1052f624dea59ec0d939b8910bc167b3b92ddb01df515e65ca7874f0ce1b9dca376cb715bfda492ab516a1385d3536bc94
7
+ data.tar.gz: d4a6dfa37d2568b6695e2ad6a90db066b8a772ccb66c0c534ec42912e7c88eb3efff331dc1b5c1008405bd9926ff647a9fa285fcb9b7fc0a37c3917647f5b24a
data/bin/tongue ADDED
@@ -0,0 +1,46 @@
1
#!/usr/bin/env ruby

# tongue — detect language type for a file, or, given a directory, determine language breakdown
# usage: tongue <path>

require 'linguist/file_blob'
require 'linguist/repository'

# Default to the current working directory when no path argument is given.
path = ARGV[0] || Dir.pwd

if File.directory?(path)
  repo = Linguist::Repository.from_directory(path)
  # Largest language first.
  repo.languages.sort_by { |_, size| size }.reverse.each do |language, size|
    # Percentage of the repository's bytes attributed to this language.
    # (The original did `sprintf '%.2f' % percentage`, which formats the
    # number and then feeds the RESULT back to sprintf as a format string —
    # redundant and fragile. Use sprintf with an explicit argument.)
    percentage = sprintf('%.2f', (size / repo.size.to_f) * 100)
    puts "%-7s %s" % ["#{percentage}%", language]
  end
elsif File.file?(path)
  blob = Linguist::FileBlob.new(path, Dir.pwd)
  type = if blob.text?
    'Text'
  elsif blob.image?
    'Image'
  else
    'Binary'
  end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts "  type:      #{type}"
  puts "  mime type: #{blob.mime_type}"
  puts "  language:  #{blob.language}"

  if blob.large?
    puts "  blob is too large to be shown"
  end

  if blob.generated?
    puts "  appears to be generated source code"
  end

  if blob.vendored?
    puts "  appears to be a vendored file"
  end
else
  abort "usage: tongue <path>"
end
data/lib/linguist.rb ADDED
@@ -0,0 +1,6 @@
1
+ require 'linguist/blob_helper'
2
+ require 'linguist/generated'
3
+ require 'linguist/heuristics'
4
+ require 'linguist/language'
5
+ require 'linguist/repository'
6
+ require 'linguist/samples'
@@ -0,0 +1,333 @@
1
+ require 'linguist/generated'
2
+ require 'linguist/language'
3
+
4
+ # require 'charlock_holmes'
5
+ # require 'escape_utils'
6
+ # require 'mime/types'
7
+ require 'pygments'
8
+ require 'yaml'
9
+
10
module Linguist
  # DEPRECATED Avoid mixing into Blob classes. Prefer functional interfaces
  # like `Language.detect` over `Blob#language`. Functions are much easier to
  # cache and compose.
  #
  # Avoid adding additional bloat to this module.
  #
  # BlobHelper is a mixin for Blobish classes that respond to "name",
  # "data" and "size" such as Grit::Blob.
  #
  # NOTE(review): many predicates below are hard-coded stubs. The original
  # implementations (kept as trailing comments) relied on charlock_holmes,
  # escape_utils and mime-types, which this stripped-down gem does not ship.
  module BlobHelper
    # Public: Get the extname of the path
    #
    # Examples
    #
    #   blob(name='foo.rb').extname
    #   # => '.rb'
    #
    # Returns a String
    def extname
      File.extname(name.to_s)
    end

    # Internal: Lookup mime type for extension.
    #
    # Stubbed: always reports plain text (mime-types dependency removed).
    #
    # Returns a mime type String.
    def _mime_type
      'text/plain'
      # if defined? @_mime_type
      #   @_mime_type
      # else
      #   guesses = ::MIME::Types.type_for(extname.to_s)
      #
      #   # Prefer text mime types over binary
      #   @_mime_type = guesses.detect { |type| type.ascii? } ||
      #     # Otherwise use the first guess
      #     guesses.first
      # end
    end

    # Public: Get the actual blob mime type
    #
    # Examples
    #
    #   # => 'text/plain'
    #   # => 'text/html'
    #
    # Returns a mime type String.
    def mime_type
      'text/plain'
    end

    # Internal: Is the blob binary according to its mime type
    #
    # Return true or false
    def binary_mime_type?
      false
      # _mime_type ? _mime_type.binary? : false
    end

    # Internal: Is the blob binary according to its mime type,
    # overriding it if we have better data from the languages.yml
    # database.
    #
    # Return true or false
    def likely_binary?
      false
      # binary_mime_type? && !Language.find_by_filename(name)
    end

    # Public: Get the Content-Type header value
    #
    # This value is used when serving raw blobs.
    #
    # Examples
    #
    #   # => 'text/plain; charset=utf-8'
    #   # => 'application/octet-stream'
    #
    # Returns a content type String.
    def content_type
      "text/plain"
      # @content_type ||= (binary_mime_type? || binary?) ? mime_type :
      #   (encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")
    end

    # Public: Get the Content-Disposition header value
    #
    # This value is used when serving raw blobs.
    #
    #   # => "attachment; filename=file.tar"
    #   # => "inline"
    #
    # Returns a content disposition String.
    def disposition
      if text? || image?
        'inline'
      elsif name.nil?
        "attachment"
      else
        'attachment'
        # "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
      end
    end

    # Public: Get the assumed encoding of the blob.
    #
    # Stubbed: charlock_holmes was removed, so UTF-8 is always assumed.
    #
    # Returns an encoding name String.
    def encoding
      # if hash = detect_encoding
      'UTF-8'
      # end
    end

    # Try to guess the encoding
    #
    # Returns: a Hash, with :encoding, :confidence, :type
    # this will return nil if an error occurred during detection or
    # no valid encoding could be found
    #
    # Stubbed: always claims confident UTF-8 text.
    def detect_encoding
      {:encoding => 'UTF-8', :confidence => 100, :type => :text}
    end

    # Public: Is the blob binary?
    #
    # Only nil data (large blobs aren't even loaded into memory) is treated
    # as binary; the charlock_holmes-based detection is stubbed out.
    #
    # Return true or false
    def binary?
      data.nil?
      # Treat blank files as text
      # elsif data == ""
      #   false
      # Charlock doesn't know what to think
      # elsif encoding.nil?
      #   true
      # If Charlock says its binary
      # else
      #   detect_encoding[:type] == :binary
    end

    # Public: Is the blob text?
    #
    # Return true or false
    def text?
      true
    end

    # Public: Is the blob a supported image format?
    #
    # Return true or false
    def image?
      false
      # ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
    end

    # Public: Is the blob a supported 3D model format?
    #
    # Return true or false
    def solid?
      false
      # extname.downcase == '.stl'
    end

    # Public: Is this blob a CSV file?
    #
    # Return true or false
    def csv?
      false
      # text? && extname.downcase == '.csv'
    end

    # Public: Is the blob a PDF?
    #
    # Return true or false
    def pdf?
      false
      # extname.downcase == '.pdf'
    end

    # MEGABYTE = 1024 * 1024

    # Public: Is the blob too big to load?
    #
    # Return true or false
    def large?
      false
      # size.to_i > MEGABYTE
    end

    # Public: Is the blob safe to colorize?
    #
    # We use Pygments for syntax highlighting blobs. Pygments
    # can be too slow for very large blobs or for certain
    # corner-case blobs.
    #
    # Return true or false
    def safe_to_colorize?
      true
      # !large? && text? && !high_ratio_of_long_lines?
    end

    # Internal: Does the blob have a ratio of long lines?
    #
    # These types of files are usually going to make Pygments.rb
    # angry if we try to colorize them.
    #
    # Return true or false
    def high_ratio_of_long_lines?
      false
      # return false if loc == 0
      # size / loc > 5000
    end

    # Public: Is the blob viewable?
    #
    # Non-viewable blobs will just show a "View Raw" link
    #
    # Return true or false
    def viewable?
      true
      # !large? && text?
    end

    # vendored_paths = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
    # VendoredRegexp = Regexp.new(vendored_paths.join('|'))

    # Public: Is the blob in a vendored directory?
    #
    # Vendored files are ignored by language statistics.
    #
    # See "vendor.yml" for a list of vendored conventions that match
    # this pattern.
    #
    # Return true or false
    def vendored?
      false
      # name =~ VendoredRegexp ? true : false
    end

    # Public: Get each line of data
    #
    # Requires Blob#data
    #
    # BUGFIX: the previous guard was `!data == ''`, which Ruby parses as
    # `(!data) == ''` — always false — so this method ALWAYS returned [],
    # making #loc and #sloc report 0 for every file.
    #
    # Returns an Array of lines
    def lines
      @lines ||=
        if viewable? && !data.nil? && data != ''
          data.split(/\r\n|\r|\n/, -1)
        else
          []
        end
    end

    # Public: Get number of lines of code
    #
    # Requires Blob#data
    #
    # Returns Integer
    def loc
      lines.size
    end

    # Public: Get number of source lines of code
    #
    # Requires Blob#data
    #
    # Returns Integer (lines containing at least one non-whitespace char)
    def sloc
      lines.grep(/\S/).size
    end

    # Public: Is the blob a generated file?
    #
    # Generated source code is suppressed in diffs and is ignored by
    # language statistics.
    #
    # May load Blob#data
    #
    # Return true or false
    def generated?
      false
      # @_generated ||= Generated.generated?(name, lambda { data })
    end

    # Public: Detects the Language of the blob.
    #
    # May load Blob#data
    #
    # Returns a Language or nil if none is detected
    def language
      return @language if defined? @language

      # Reuse already-loaded data when available; otherwise hand
      # Language.detect a lambda so loading stays lazy.
      # BUGFIX: the previous guard contained `!data == ''` (always false),
      # so the eager branch was unreachable.
      if defined?(@data) && @data.is_a?(String) && !@data.empty?
        data = @data
      else
        data = lambda { self.data }
      end

      # NOTE(review): `mode` is expected to be provided by the including
      # blob class (BlobHelper itself does not define it) — confirm callers.
      @language = Language.detect(name.to_s, data, mode)
    end

    # Internal: Get the lexer of the blob.
    #
    # Returns a Lexer.
    def lexer
      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
    end

    # Public: Highlight syntax of blob
    #
    # options - A Hash of options (defaults to {})
    #
    # Returns html String
    def colorize(options = {})
      return unless safe_to_colorize?
      options[:options] ||= {}
      options[:options][:encoding] ||= encoding
      lexer.highlight(data, options)
    end
  end
end
@@ -0,0 +1,171 @@
1
+ require 'linguist/tokenizer'
2
+
3
module Linguist
  # Language bayesian classifier.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object
    # language - String language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   Classifier.train!(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    #
    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token or
    # per-language. See also #debug_dump_all_tokens, below.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      # Lazily initialize the database buckets so an empty Hash works.
      db['tokens_total']    ||= 0
      db['languages_total'] ||= 0
      db['tokens']          ||= {}
      db['language_tokens'] ||= {}
      db['languages']       ||= {}

      tokens.each do |token|
        db['tokens'][language] ||= {}
        db['tokens'][language][token] ||= 0
        db['tokens'][language][token] += 1
        db['language_tokens'][language] ||= 0
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1
      end
      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [ ['Ruby', 0.90], ['Python', 0.2], ... ]
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def self.classify(db, tokens, languages = nil)
      languages ||= db['languages'].keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens']
      @language_tokens = db['language_tokens']
      @languages       = db['languages']
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score (highest first).
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
      scores = {}

      debug_dump_all_tokens(tokens, languages) if verbosity >= 2

      languages.each do |language|
        debug_dump_probabilities(tokens, language) if verbosity >= 1
        # Naive Bayes in log space: log P(D|C) + log P(C).
        scores[language] = tokens_probability(tokens, language) + language_probability(language)
      end

      scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
    end

    # Internal: Probability of set of tokens in a language occurring - P(D | C)
    #
    # tokens   - Array of String tokens.
    # language - Language to check.
    #
    # Returns a Float (sum of log-probabilities, so <= 0.0).
    def tokens_probability(tokens, language)
      tokens.inject(0.0) do |sum, token|
        sum + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of token in language occurring - P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # Returns Float between 0.0 and 1.0.
    def token_probability(token, language)
      if @tokens[language][token].to_f == 0.0
        # Unseen token: fall back to a tiny uniform probability so the
        # log never blows up on -Infinity.
        1 / @tokens_total.to_f
      else
        @tokens[language][token].to_f / @language_tokens[language].to_f
      end
    end

    # Internal: Probability of a language occurring - P(C)
    #
    # language - Language to check.
    #
    # Returns Float (log-probability, <= 0.0).
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end

    private
    # Internal: Debug verbosity, from the LINGUIST_DEBUG env var (0 if unset).
    def verbosity
      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
    end

    # Internal: Print the score breakdown for one language.
    #
    # BUGFIX: the original referenced an undefined local `scores` here,
    # raising NameError whenever LINGUIST_DEBUG >= 1. Recompute the two
    # terms locally instead.
    def debug_dump_probabilities(tokens, language)
      tp = tokens_probability(tokens, language)
      lp = language_probability(language)
      printf("%10s = %10.3f + %7.3f = %10.3f\n", language, tp, lp, tp + lp)
    end

    # Internal: show a table of probabilities for each <token,language> pair.
    #
    # The number in each table entry is the number of "points" that each
    # token contributes toward the belief that the file under test is a
    # particular language. Points are additive.
    #
    # Points are the number of times a token appears in the file, times
    # how much more likely (log of probability ratio) that token is to
    # appear in one language vs. the least-likely language. Dashes
    # indicate the least-likely language (and zero points) for each token.
    def debug_dump_all_tokens(tokens, languages)
      maxlen = tokens.map { |tok| tok.size }.max

      printf "%#{maxlen}s", ""
      puts "    #" + languages.map { |lang| sprintf("%10s", lang) }.join

      token_map = Hash.new(0)
      tokens.each { |tok| token_map[tok] += 1 }

      token_map.sort.each { |tok, count|
        arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
        min = arr.map { |a, b| b }.min
        minlog = Math.log(min)
        if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] }
          printf "%#{maxlen}s%5d", tok, count

          puts arr.map { |ent|
            ent[1] == min ? "         -" : sprintf("%10.3f", count * (Math.log(ent[1]) - minlog))
          }.join
        end
      }
    end
  end
end