RubyGems - geothird-linguist - Versions diffs - 2.6.1 - Mend

geothird-linguist 2.6.1

Files changed (17) hide show

checksums.yaml +7 -0
data/bin/linguist +42 -0
data/lib/linguist.rb +5 -0
data/lib/linguist/blob_helper.rb +360 -0
data/lib/linguist/classifier.rb +123 -0
data/lib/linguist/file_blob.rb +56 -0
data/lib/linguist/generated.rb +175 -0
data/lib/linguist/language.rb +481 -0
data/lib/linguist/languages.yml +1403 -0
data/lib/linguist/md5.rb +38 -0
data/lib/linguist/popular.yml +29 -0
data/lib/linguist/repository.rb +95 -0
data/lib/linguist/samples.json +32050 -0
data/lib/linguist/samples.rb +98 -0
data/lib/linguist/tokenizer.rb +197 -0
data/lib/linguist/vendor.yml +106 -0
metadata +170 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 29e608de9f7d1f047fedc42252372a33c8f9af97
+  data.tar.gz: 184de1c9648189df496844f1b83299be88597ceb
+SHA512:
+  metadata.gz: 28e2e56c28062cbb43bd9b54bc522512caf479459a065db81fccbb14d11a6e74060bb695b85b5b04b05325863b6388187581e7fe44f2b8573d6d0faa90f6c8ba
+  data.tar.gz: f561dd836463b6ea186fdc98ace219e0d8901a209875091ac6279e23375c0ab86346a96458b08aa462ba021675ed0310e685ecb6293a57b2ab48e9aca7c0d90f

data/bin/linguist ADDED

@@ -0,0 +1,42 @@
+#!/usr/bin/env ruby
+require 'linguist/file_blob'
+require 'linguist/repository'
+# Entry point: print a language breakdown for a directory, or a short
+# report for a single file. Defaults to the current working directory.
+path = ARGV[0] || Dir.pwd
+if File.directory?(path)
+  repo = Linguist::Repository.from_directory(path)
+  # Largest language first.
+  ranked = repo.languages.sort_by { |_, size| size }.reverse
+  ranked.each do |language, size|
+    percentage = ((size / repo.size.to_f) * 100).round
+    puts "%-4s %s" % ["#{percentage}%", language]
+  end
+elsif File.file?(path)
+  blob = Linguist::FileBlob.new(path, Dir.pwd)
+  type =
+    if blob.text?
+      'Text'
+    elsif blob.image?
+      'Image'
+    else
+      'Binary'
+    end
+  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
+  puts "  type:      #{type}"
+  puts "  mime type: #{blob.mime_type}"
+  puts "  language:  #{blob.language}"
+  puts "  blob is too large to be shown" if blob.large?
+  puts "  appears to be generated source code" if blob.generated?
+  puts "  appears to be a vendored file" if blob.vendored?
+else
+  # Neither a directory nor a regular file.
+  abort "usage: linguist <path>"
+end

data/lib/linguist.rb ADDED

@@ -0,0 +1,5 @@
+require 'linguist/blob_helper'
+require 'linguist/generated'
+require 'linguist/language'
+require 'linguist/repository'
+require 'linguist/samples'

data/lib/linguist/blob_helper.rb ADDED

@@ -0,0 +1,360 @@
+require 'linguist/generated'
+require 'linguist/language'
+require 'charlock_holmes'
+require 'escape_utils'
+require 'mime/types'
+require 'pygments'
+require 'yaml'
+module Linguist
+  # BlobHelper is a mixin for Blobish classes that respond to "name",
+  # "data" and "size" such as Grit::Blob.
+  module BlobHelper
+    # Public: File extension of the blob's name, leading dot included.
+    #
+    # Examples
+    #
+    #   blob(name='foo.rb').extname
+    #   # => '.rb'
+    #
+    # Returns a String
+    def extname
+      path = name.to_s
+      File.extname(path)
+    end
+    # Internal: Look up the mime type for the blob's extension.
+    #
+    # The result (including nil) is memoized in @_mime_type.
+    #
+    # Returns a MIME::Type, or nil when there is no guess at all.
+    def _mime_type
+      return @_mime_type if defined? @_mime_type
+      candidates = ::MIME::Types.type_for(extname.to_s)
+      # A text (ascii) type wins over a binary one; failing that, take
+      # whatever was guessed first.
+      text_type = candidates.detect { |candidate| candidate.ascii? }
+      @_mime_type = text_type || candidates.first
+    end
+    # Public: Mime type of the blob as a String.
+    #
+    # Examples
+    #
+    #   # => 'text/plain'
+    #   # => 'text/html'
+    #
+    # Returns a mime type String; 'text/plain' when no type was found.
+    def mime_type
+      if _mime_type
+        _mime_type.to_s
+      else
+        'text/plain'
+      end
+    end
+    # Internal: Does the looked-up mime type say the blob is binary?
+    #
+    # Returns false when no mime type was found; otherwise whatever the
+    # mime type's binary? reports.
+    def binary_mime_type?
+      return false unless _mime_type
+      _mime_type.binary?
+    end
+    # Internal: Is the blob binary according to its mime type, unless
+    # Language.find_by_filename recognises the file name (in which case
+    # the language database takes precedence).
+    #
+    # Return true or false
+    def likely_binary?
+      binary_mime_type? && !Language.find_by_filename(name)
+    end
+    # Public: Value for the Content-Type header.
+    #
+    # This value is used when serving raw blobs. Binary blobs report
+    # their mime type; everything else is served as text/plain, with the
+    # detected charset appended when one is known.
+    #
+    # Examples
+    #
+    #   # => 'text/plain; charset=utf-8'
+    #   # => 'application/octet-stream'
+    #
+    # Returns a content type String.
+    def content_type
+      @content_type ||=
+        if binary_mime_type? || binary?
+          mime_type
+        elsif encoding
+          "text/plain; charset=#{encoding.downcase}"
+        else
+          "text/plain"
+        end
+    end
+    # Public: Value for the Content-Disposition header.
+    #
+    # This value is used when serving raw blobs. Text and images are
+    # shown inline; anything else is offered as an attachment, named
+    # after the blob's URL-escaped basename when it has a name.
+    #
+    #   # => "attachment; filename=file.tar"
+    #   # => "inline"
+    #
+    # Returns a content disposition String.
+    def disposition
+      return 'inline' if text? || image?
+      return "attachment" if name.nil?
+      "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
+    end
+    # Public: Name of the detected encoding.
+    #
+    # Returns the :encoding entry of detect_encoding's result, or nil
+    # when detection produced nothing.
+    def encoding
+      detected = detect_encoding
+      detected[:encoding] if detected
+    end
+    # Try to guess the encoding of the blob's data.
+    #
+    # The detector's answer is memoized in @detect_encoding.
+    #
+    # Returns: a Hash, with :encoding, :confidence, :type
+    #          this will return nil if an error occurred during detection or
+    #          no valid encoding could be found; also nil when data is nil
+    def detect_encoding
+      return unless data
+      @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data)
+    end
+    # Public: Is the blob binary?
+    #
+    # Return true or false
+    def binary?
+      # Large blobs aren't even loaded into memory
+      return true if data.nil?
+      # Treat blank files as text
+      return false if data == ""
+      # Charlock doesn't know what to think
+      return true if encoding.nil?
+      # Otherwise go by what Charlock reports
+      detect_encoding[:type] == :binary
+    end
+    # Public: Is the blob text? (The negation of binary?.)
+    #
+    # Return true or false
+    def text?
+      binary? ? false : true
+    end
+    # Public: Does the extension name a supported image format
+    # (.png, .jpg, .jpeg or .gif)?
+    #
+    # Return true or false
+    def image?
+      %w[.png .jpg .jpeg .gif].include?(extname)
+    end
+    # Public: Does the extension name a supported 3D model format
+    # (.stl or .obj)?
+    #
+    # Return true or false
+    def solid?
+      %w[.stl .obj].include?(extname)
+    end
+    # Number of bytes in one megabyte; the cut-off used by large?.
+    MEGABYTE = 1024 * 1024
+    # Public: Is the blob too big to load?
+    #
+    # The size is coerced with to_i before being compared to MEGABYTE.
+    #
+    # Return true or false
+    def large?
+      MEGABYTE < size.to_i
+    end
+    # Public: Is the blob safe to colorize?
+    #
+    # We use Pygments.rb for syntax highlighting blobs, which
+    # has some quirks and also is essentially 'un-killable' via
+    # normal timeout.  To work around this we try to avoid
+    # handing Pygments.rb anything it can't handle.
+    #
+    # Return true or false
+    def safe_to_colorize?
+      return false if large?
+      return false unless text?
+      !high_ratio_of_long_lines?
+    end
+    # Internal: Does the blob have a high ratio of long lines?
+    #
+    # These types of files are usually going to make Pygments.rb
+    # angry if we try to colorize them. The check is size divided by
+    # line count exceeding 5000; a blob with no lines never qualifies.
+    #
+    # Return true or false
+    def high_ratio_of_long_lines?
+      if loc == 0
+        false
+      else
+        size / loc > 5000
+      end
+    end
+    # Public: Is the blob viewable, i.e. text and not large?
+    #
+    # Non-viewable blobs will just show a "View Raw" link
+    #
+    # Return true or false
+    def viewable?
+      return false if large?
+      text?
+    end
+    # Patterns are read once, at load time, from the vendor.yml that
+    # sits next to this file, and joined into a single alternation.
+    patterns = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
+    VendoredRegexp = Regexp.new(patterns.join('|'))
+    # Public: Is the blob in a vendored directory?
+    #
+    # Vendored files are ignored by language statistics.
+    #
+    # See "vendor.yml" for a list of vendored conventions that match
+    # this pattern.
+    #
+    # Return true or false
+    def vendored?
+      if name =~ VendoredRegexp
+        true
+      else
+        false
+      end
+    end
+    # Public: The blob's data split into lines.
+    #
+    # Requires Blob#data. Non-viewable blobs, and blobs without data,
+    # yield an empty Array. The result is memoized in @lines.
+    #
+    # Returns an Array of lines
+    def lines
+      @lines ||= (viewable? && data) ? data.split(line_split_character, -1) : []
+    end
+    # Character used to split lines. This is almost always "\n" except when Mac
+    # Format is detected in which case it's "\r". Memoized after first use.
+    #
+    # Returns a split pattern string.
+    def line_split_character
+      @line_split_character ||=
+        if mac_format?
+          "\r"
+        else
+          "\n"
+        end
+    end
+    # Public: Is the data in ** Mac Format **. This format uses \r (0x0d) characters
+    # for line ends and does not include a \n (0x0a).
+    #
+    # Only the first 4096 characters are searched for a \r; the verdict
+    # rests on the character that follows the first one found.
+    #
+    # Returns true when mac format is detected, false when the first \r
+    # is followed by \n, and nil when the blob is not viewable or has no
+    # \r in the searched prefix.
+    def mac_format?
+      return unless viewable?
+      cr_index = data[0, 4096].index("\r")
+      data[cr_index + 1] != ?\n if cr_index
+    end
+    # Public: Number of lines in the blob (blank lines included).
+    #
+    # Requires Blob#data
+    #
+    # Returns Integer
+    def loc
+      lines.length
+    end
+    # Public: Number of source lines, i.e. lines containing at least one
+    # non-whitespace character.
+    #
+    # Requires Blob#data
+    #
+    # Returns Integer
+    def sloc
+      lines.select { |line| line =~ /\S/ }.length
+    end
+    # Public: Is the blob a generated file?
+    #
+    # Generated source code is suppressed in diffs and is ignored by
+    # language statistics.
+    #
+    # May load Blob#data (it is handed to Generated.generated? wrapped in
+    # a lambda rather than read up front).
+    #
+    # Return true or false
+    def generated?
+      # `||=` would re-run the check on every call whenever the answer is
+      # false, so memoize with defined? like _mime_type and language do.
+      return @_generated if defined? @_generated
+      @_generated = Generated.generated?(name, lambda { data })
+    end
+    # Public: Should the blob be indexed for searching?
+    #
+    # Excluded:
+    # - Files over 100KB
+    # - Non-text files
+    # - Languages marked as not searchable
+    # - Generated source files
+    #
+    # Files ending in .txt are indexed as long as they pass the size and
+    # text checks, whatever language is detected.
+    #
+    # Please add additional test coverage to
+    # `test/test_blob.rb#test_indexable` if you make any changes.
+    #
+    # Return true or false
+    def indexable?
+      return false if size > 100 * 1024
+      return false if binary?
+      return true if extname == '.txt'
+      return false if language.nil?
+      return false unless language.searchable?
+      generated? ? false : true
+    end
+    # Public: Detects the Language of the blob.
+    #
+    # May load Blob#data. When @data already holds a String it is passed
+    # to Language.detect directly; otherwise a lambda is passed so the
+    # data is only read on demand (binary blobs yield "").
+    # The result, nil included, is memoized in @language.
+    #
+    # Returns a Language or nil if none is detected
+    def language
+      return @language if defined? @language
+      source =
+        if defined?(@data) && @data.is_a?(String)
+          @data
+        else
+          lambda { (binary_mime_type? || binary?) ? "" : self.data }
+        end
+      @language = Language.detect(name.to_s, source, mode)
+    end
+    # Internal: Get the lexer of the blob.
+    #
+    # Uses the detected language's lexer, or the Pygments 'Text only'
+    # lexer when no language was detected.
+    #
+    # Returns a Lexer.
+    def lexer
+      return Pygments::Lexer.find_by_name('Text only') unless language
+      language.lexer
+    end
+    # Public: Highlight syntax of blob
+    #
+    # options - A Hash of options (defaults to {}). The Hash and its
+    #           nested :options Hash are copied, so the caller's objects
+    #           are left untouched. :options => :encoding defaults to
+    #           the blob's detected encoding.
+    #
+    # Returns html String, or nil when the blob is not safe to colorize.
+    def colorize(options = {})
+      return unless safe_to_colorize?
+      # Work on copies: filling in defaults must not leak back into the
+      # Hash the caller passed (which may be shared or reused).
+      highlight_options = options.dup
+      highlight_options[:options] = (highlight_options[:options] || {}).dup
+      highlight_options[:options][:encoding] ||= encoding
+      lexer.highlight(data, highlight_options)
+    end
+    # Public: Highlight syntax of blob without the outer highlight div
+    # wrapper.
+    #
+    # options - A Hash of options (defaults to {}), passed on to colorize.
+    #
+    # Returns html String; '' when colorize returns nothing.
+    def colorize_without_wrapper(options = {})
+      html = colorize(options)
+      return '' unless html
+      # Keep only what sits inside <div class="highlight"><pre>...</pre></div>.
+      html[%r{<div class="highlight"><pre>(.*?)</pre>\s*</div>}m, 1]
+    end
+  end
+end

data/lib/linguist/classifier.rb ADDED

@@ -0,0 +1,123 @@
+require 'linguist/tokenizer'
+module Linguist
+  # Language bayesian classifier.
+  #
+  # Scores are computed in log space: each language's score is the sum
+  # of the logs of its per-token probabilities plus the log of the
+  # language's own probability.
+  class Classifier
+    # Public: Train classifier that data is a certain language.
+    #
+    # db       - Hash classifier database object (updated in place)
+    # language - String language of data
+    # data     - String contents of file
+    #
+    # Examples
+    #
+    #   Classifier.train!(db, 'Ruby', "def hello; end")
+    #
+    # Returns nothing.
+    def self.train!(db, language, data)
+      tokens = Tokenizer.tokenize(data)
+      db['tokens_total'] ||= 0
+      db['languages_total'] ||= 0
+      db['tokens'] ||= {}
+      db['language_tokens'] ||= {}
+      db['languages'] ||= {}
+      tokens.each do |token|
+        db['tokens'][language] ||= {}
+        db['tokens'][language][token] ||= 0
+        db['tokens'][language][token] += 1
+        db['language_tokens'][language] ||= 0
+        db['language_tokens'][language] += 1
+        db['tokens_total'] += 1
+      end
+      # The sample is counted even when it produced no tokens, so a
+      # language may be listed here without an entry in db['tokens'].
+      db['languages'][language] ||= 0
+      db['languages'][language] += 1
+      db['languages_total'] += 1
+      nil
+    end
+    # Public: Guess language of data.
+    #
+    # db        - Hash of classifier tokens database.
+    # tokens    - Array of tokens or String data to analyze.
+    # languages - Array of language name Strings to restrict to
+    #             (defaults to every language in db).
+    #
+    # Examples
+    #
+    #   Classifier.classify(db, "def hello; end")
+    #   # => [['Ruby', -3.2], ['Python', -7.9], ...]   (illustrative values)
+    #
+    # Returns sorted Array of result pairs, best match first. Each pair
+    # contains the String language name and a Float log-space score.
+    def self.classify(db, tokens, languages = nil)
+      languages ||= db['languages'].keys
+      new(db).classify(tokens, languages)
+    end
+    # Internal: Initialize a Classifier.
+    #
+    # db - Hash classifier database, as built by train!.
+    def initialize(db = {})
+      @tokens_total    = db['tokens_total']
+      @languages_total = db['languages_total']
+      @tokens          = db['tokens']
+      @language_tokens = db['language_tokens']
+      @languages       = db['languages']
+    end
+    # Internal: Guess language of data
+    #
+    # tokens    - Array of tokens or String data to analyze.
+    # languages - Array of language name Strings to restrict to.
+    #
+    # Returns sorted Array of result pairs, best match first. Each pair
+    # contains the String language name and a Float log-space score.
+    # Returns an empty Array when tokens is nil.
+    def classify(tokens, languages)
+      return [] if tokens.nil?
+      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
+      scores = {}
+      languages.each do |language|
+        scores[language] = tokens_probability(tokens, language) +
+                                   language_probability(language)
+      end
+      # Highest score first.
+      scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
+    end
+    # Internal: Log-probability of set of tokens in a language
+    # occurring - log P(D | C)
+    #
+    # tokens   - Array of String tokens.
+    # language - Language to check.
+    #
+    # Returns Float: the sum of the natural logs of each token's
+    # probability (0.0 for an empty token list).
+    def tokens_probability(tokens, language)
+      tokens.inject(0.0) do |sum, token|
+        sum + Math.log(token_probability(token, language))
+      end
+    end
+    # Internal: Probability of token in language occurring - P(F | C)
+    #
+    # token    - String token.
+    # language - Language to check.
+    #
+    # Returns Float: the token's share of the language's tokens, or
+    # 1 / tokens_total for a token the language has never been seen with.
+    def token_probability(token, language)
+      # A language can be known to the db without any recorded tokens
+      # (trained only on data that tokenized to nothing); treat every
+      # token as unseen for it instead of calling [] on nil.
+      counts = @tokens[language]
+      count  = counts ? counts[token].to_f : 0.0
+      if count == 0.0
+        1 / @tokens_total.to_f
+      else
+        count / @language_tokens[language].to_f
+      end
+    end
+    # Internal: Log-probability of a language occurring - log P(C)
+    #
+    # language - Language to check.
+    #
+    # Returns Float: the natural log of the language's share of all
+    # trained samples.
+    def language_probability(language)
+      Math.log(@languages[language].to_f / @languages_total.to_f)
+    end
+  end
+end