RubyGems - tongue - Versions diffs - 0.2.10.8 - Mend

tongue 0.2.10.8

Files changed (18) hide show

checksums.yaml +7 -0
data/bin/tongue +46 -0
data/lib/linguist.rb +6 -0
data/lib/linguist/blob_helper.rb +333 -0
data/lib/linguist/classifier.rb +171 -0
data/lib/linguist/file_blob.rb +58 -0
data/lib/linguist/generated.rb +241 -0
data/lib/linguist/heuristics.rb +38 -0
data/lib/linguist/language.rb +578 -0
data/lib/linguist/languages.yml +1901 -0
data/lib/linguist/md5.rb +38 -0
data/lib/linguist/popular.yml +29 -0
data/lib/linguist/repository.rb +95 -0
data/lib/linguist/samples.json +47115 -0
data/lib/linguist/samples.rb +149 -0
data/lib/linguist/tokenizer.rb +198 -0
data/lib/linguist/vendor.yml +167 -0
metadata +143 -0

data/lib/linguist/samples.rb ADDED Viewed

@@ -0,0 +1,149 @@
+begin
+  require 'json'
+rescue LoadError
+  require 'yaml'
+end
+require 'linguist/md5'
+require 'linguist/classifier'
+module Linguist
+  # Model for accessing classifier training data.
+  module Samples
+    # Path to samples root directory
+    ROOT = File.expand_path("../../../samples", __FILE__)
+    # Path for serialized samples db
+    PATH = File.expand_path('../samples.json', __FILE__)
+    # Hash of serialized samples object
+    if File.exist?(PATH)
+      serializer = defined?(JSON) ? JSON : YAML
+      DATA = serializer.load(File.read(PATH))
+    end
+    # Public: Iterate over each sample.
+    #
+    # &block - Yields Sample to block
+    #
+    # Returns nothing.
+    def self.each(&block)
+      Dir.entries(ROOT).each do |category|
+        next if category == '.' || category == '..'
+        # Skip text and binary for now
+        # Possibly reconsider this later
+        next if category == 'Text' || category == 'Binary'
+        dirname = File.join(ROOT, category)
+        Dir.entries(dirname).each do |filename|
+          next if filename == '.' || filename == '..'
+          if filename == 'filenames'
+            Dir.entries(File.join(dirname, filename)).each do |subfilename|
+              next if subfilename == '.' || subfilename == '..'
+              yield({
+                :path    => File.join(dirname, filename, subfilename),
+                :language => category,
+                :filename => subfilename
+              })
+            end
+          else
+            if File.extname(filename) == ""
+              raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
+            end
+            yield({
+              :path     => File.join(dirname, filename),
+              :language => category,
+              :interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
+              :extname  => File.extname(filename)
+            })
+          end
+        end
+      end
+      nil
+    end
+    # Public: Build Classifier from all samples.
+    #
+    # Returns trained Classifier.
+    def self.data
+      db = {}
+      db['extnames'] = {}
+      db['interpreters'] = {}
+      db['filenames'] = {}
+      each do |sample|
+        language_name = sample[:language]
+        if sample[:extname]
+          db['extnames'][language_name] ||= []
+          if !db['extnames'][language_name].include?(sample[:extname])
+            db['extnames'][language_name] << sample[:extname]
+            db['extnames'][language_name].sort!
+          end
+        end
+        if sample[:interpreter]
+          db['interpreters'][language_name] ||= []
+          if !db['interpreters'][language_name].include?(sample[:interpreter])
+            db['interpreters'][language_name] << sample[:interpreter]
+            db['interpreters'][language_name].sort!
+          end
+        end
+        if sample[:filename]
+          db['filenames'][language_name] ||= []
+          db['filenames'][language_name] << sample[:filename]
+          db['filenames'][language_name].sort!
+        end
+        data = File.read(sample[:path])
+        Classifier.train!(db, language_name, data)
+      end
+      db['md5'] = Linguist::MD5.hexdigest(db)
+      db
+    end
+  end
+  # Used to retrieve the interpreter from the shebang line of a file's
+  # data.
+  #
+  # data - String contents of a file; only the first line is inspected for
+  #        the shebang (the first five lines for the `exec` hack below).
+  #
+  # Examples
+  #
+  #   interpreter_from_shebang("#!/usr/bin/python2.6\n")
+  #   # => "python"
+  #
+  #   interpreter_from_shebang("#!/usr/bin/env ruby\n")
+  #   # => "ruby"
+  #
+  # Returns the interpreter name String, or nil when the data is empty or
+  # its first line does not start with "#!". Also nil when `env` is given
+  # no program.
+  # NOTE(review): a bare "#!" line yields an empty String rather than nil.
+  def self.interpreter_from_shebang(data)
+    lines = data.lines.to_a
+    # match[0] is the whole first line (including its trailing newline);
+    # the trailing `=~ /^#!/` makes sure it really is a shebang.
+    if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
+      # Normalize "#! /path" to "#!/path" so the path is the first token.
+      bang.sub!(/^#! /, '#!')
+      tokens = bang.split(' ')
+      pieces = tokens.first.split('/')
+      if pieces.size > 1
+        script = pieces.last
+      else
+        # No slash in the path, e.g. "#!ruby": drop the "#!" prefix.
+        script = pieces.first.sub('#!', '')
+      end
+      # With `env`, the real interpreter is the next token (nil if absent).
+      script = script == 'env' ? tokens[1] : script
+      # "python2.6" -> "python"
+      if script =~ /((?:\d+\.?)+)/
+        script.sub! $1, ''
+      end
+      # Check for multiline shebang hacks that call `exec`
+      # ($1 below is the capture set by the match inside the block).
+      if script == 'sh' &&
+        lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
+        script = $1
+      end
+      script
+    else
+      nil
+    end
+  end
+end

data/lib/linguist/tokenizer.rb ADDED Viewed

@@ -0,0 +1,198 @@
+require 'strscan'
+module Linguist
+  # Generic programming language tokenizer.
+  #
+  # Tokens are designed for use in the language bayes classifier.
+  # It strips any data strings or comments and preserves significant
+  # language symbols.
+  class Tokenizer
+    # Public: Extract tokens from data
+    #
+    # data - String to tokenize
+    #
+    # Returns Array of token Strings.
+    def self.tokenize(data)
+      new.extract_tokens(data)
+    end
+    # Read up to 100KB: scanning stops once the scanner position reaches
+    # this value.
+    BYTE_LIMIT = 100_000
+    # Start state on token, ignore anything till the next newline
+    SINGLE_LINE_COMMENTS = [
+      '//', # C
+      '#',  # Ruby
+      '%',  # Tex
+    ]
+    # Start state on opening token, ignore anything until the closing
+    # token is reached.
+    MULTI_LINE_COMMENTS = [
+      ['/*', '*/'],    # C
+      ['<!--', '-->'], # XML
+      ['{-', '-}'],    # Haskell
+      ['(*', '*)'],    # Coq
+      ['"""', '"""']   # Python
+    ]
+    # Alternation of every single line comment opener, each one required to
+    # be followed by a space.
+    # NOTE: inside a double-quoted Ruby string "\s" is a literal space, so
+    # the compiled pattern allows leading spaces only (not tabs).
+    START_SINGLE_LINE_COMMENT =  Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
+      "\s*#{Regexp.escape(c)} "
+    }.join("|"))
+    # Alternation of every multi line comment opener.
+    START_MULTI_LINE_COMMENT =  Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
+      Regexp.escape(c[0])
+    }.join("|"))
+    # Internal: Extract generic tokens from data.
+    #
+    # The branches below are tried in order at the current scan position;
+    # the first one that matches consumes input, so their order matters.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
+    def extract_tokens(data)
+      s = StringScanner.new(data)
+      tokens = []
+      until s.eos?
+        break if s.pos >= BYTE_LIMIT
+        # Shebang line: emitted as a single "SHEBANG#!<name>" token
+        if token = s.scan(/^#!.+$/)
+          if name = extract_shebang(token)
+            tokens << "SHEBANG#!#{name}"
+          end
+        # Single line comment
+        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
+          # tokens << token.strip
+          s.skip_until(/\n|\Z/)
+        # Multiline comments
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
+          # tokens << token
+          # Look up the closing token paired with the opener that matched
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
+          # tokens << close_token
+        # Skip single or double quoted strings
+        elsif s.scan(/"/)
+          # An immediately following quote means an empty string literal;
+          # otherwise skip to the next quote not preceded by a backslash.
+          if s.peek(1) == "\""
+            s.getch
+          else
+            s.skip_until(/[^\\]"/)
+          end
+        elsif s.scan(/'/)
+          if s.peek(1) == "'"
+            s.getch
+          else
+            s.skip_until(/[^\\]'/)
+          end
+        # Skip number literals
+        elsif s.scan(/(0x)?\d(\d|\.)*/)
+        # SGML style brackets
+        elsif token = s.scan(/<[^\s<>][^<>]*>/)
+          extract_sgml_tokens(token).each { |t| tokens << t }
+        # Common programming punctuation
+        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
+          tokens << token
+        # Regular token
+        elsif token = s.scan(/[\w\.@#\/\*]+/)
+          tokens << token
+        # Common operators
+        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
+          tokens << token
+        else
+          # Anything else is dropped one character at a time
+          s.getch
+        end
+      end
+      tokens
+    end
+    # Internal: Extract normalized shebang command token.
+    #
+    # data - String shebang line.
+    #
+    # Examples
+    #
+    #   extract_shebang("#!/usr/bin/ruby")
+    #   # => "ruby"
+    #
+    #   extract_shebang("#!/usr/bin/env node")
+    #   # => "node"
+    #
+    # Returns String token or nil if it couldn't be parsed.
+    def extract_shebang(data)
+      s = StringScanner.new(data)
+      if path = s.scan(/^#!\s*\S+/)
+        script = path.split('/').last
+        # With `env`, the command is the next whitespace separated word
+        if script == 'env'
+          s.scan(/\s+/)
+          script = s.scan(/\S+/)
+        end
+        # Keep only the first run of non-digit characters
+        script = script[/[^\d]+/, 0] if script
+        return script
+      end
+      nil
+    end
+    # Internal: Extract tokens from inside SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href='' class=foo>")
+    #   # => ["<a>", "href=", "class="]
+    #
+    # Returns Array of token Strings.
+    def extract_sgml_tokens(data)
+      s = StringScanner.new(data)
+      tokens = []
+      until s.eos?
+        # Emit start token
+        if token = s.scan(/<\/?[^\s>]+/)
+          tokens << "#{token}>"
+        # Emit attributes with trailing =
+        elsif token = s.scan(/\w+=/)
+          tokens << token
+          # Then skip over attribute value
+          if s.scan(/"/)
+            s.skip_until(/[^\\]"/)
+          elsif s.scan(/'/)
+            s.skip_until(/[^\\]'/)
+          else
+            s.skip_until(/\w+/)
+          end
+        # Emit lone attributes
+        elsif token = s.scan(/\w+/)
+          tokens << token
+        # Stop at the end of the tag
+        elsif s.scan(/>/)
+          s.terminate
+        else
+          s.getch
+        end
+      end
+      tokens
+    end
+  end
+end

data/lib/linguist/vendor.yml ADDED Viewed

@@ -0,0 +1,167 @@
+# Vendored files and directories are excluded from language
+# statistics.
+#
+# Lines in this file are Regexps that are matched against the file
+# pathname.
+#
+# Please add additional test coverage to
+# `test/test_blob.rb#test_vendored` if you make any changes.
+## Vendor Conventions ##
+# Caches
+- cache/
+# Dependencies
+- ^[Dd]ependencies/
+# C deps
+#  https://github.com/joyent/node
+- ^deps/
+- ^tools/
+- (^|/)configure$
+- (^|/)configure.ac$
+- (^|/)config.guess$
+- (^|/)config.sub$
+# Node dependencies
+- node_modules/
+# Bower Components
+- bower_components/
+# Erlang bundles
+- ^rebar$
+# Bootstrap minified css and js
+- (^|/)bootstrap([^.]*)(\.min)?\.(js|css)$
+# Vendored dependencies
+- thirdparty/
+- vendors?/
+# Debian packaging
+- ^debian/
+## Commonly Bundled JavaScript frameworks ##
+# jQuery
+- (^|/)jquery([^.]*)(\.min)?\.js$
+- (^|/)jquery\-\d\.\d+(\.\d+)?(\.min)?\.js$
+# jQuery UI
+- (^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?(\.min)?\.(js|css)$
+- (^|/)jquery\.(ui|effects)\.([^.]*)(\.min)?\.(js|css)$
+# Prototype
+- (^|/)prototype(.*)\.js$
+- (^|/)effects\.js$
+- (^|/)controls\.js$
+- (^|/)dragdrop\.js$
+# MooTools
+- (^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$
+# Dojo
+- (^|/)dojo\.js$
+# MochiKit
+- (^|/)MochiKit\.js$
+# YUI
+- (^|/)yahoo-([^.]*)\.js$
+- (^|/)yui([^.]*)\.js$
+# WYSIWYG editors
+- (^|/)ckeditor\.js$
+- (^|/)tiny_mce([^.]*)\.js$
+- (^|/)tiny_mce/(langs|plugins|themes|utils)
+# MathJax
+- (^|/)MathJax/
+# SyntaxHighlighter - http://alexgorbatchev.com/
+- (^|/)shBrush([^.]*)\.js$
+- (^|/)shCore\.js$
+- (^|/)shLegacy\.js$
+# AngularJS
+- (^|/)angular([^.]*)(\.min)?\.js$
+## Python ##
+# django
+- (^|/)admin_media/
+# Fabric
+- ^fabfile\.py$
+# WAF
+- ^waf$
+# .osx
+- ^.osx$
+## Obj-C ##
+# Sparkle
+- (^|/)Sparkle/
+## .NET ##
+# Visual Studio IntelliSense
+- -vsdoc\.js$
+# jQuery validation plugin (MS bundles this with asp.net mvc)
+- (^|/)jquery([^.]*)\.validate(\.unobtrusive)?(\.min)?\.js$
+- (^|/)jquery([^.]*)\.unobtrusive\-ajax(\.min)?\.js$
+# Microsoft Ajax
+- (^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$
+# NuGet
+- ^[Pp]ackages/
+# ExtJS
+- (^|/)extjs/.*?\.js$
+- (^|/)extjs/.*?\.xml$
+- (^|/)extjs/.*?\.txt$
+- (^|/)extjs/.*?\.html$
+- (^|/)extjs/.*?\.properties$
+- (^|/)extjs/.sencha/
+- (^|/)extjs/docs/
+- (^|/)extjs/builds/
+- (^|/)extjs/cmd/
+- (^|/)extjs/examples/
+- (^|/)extjs/locale/
+- (^|/)extjs/packages/
+- (^|/)extjs/plugins/
+- (^|/)extjs/resources/
+- (^|/)extjs/src/
+- (^|/)extjs/welcome/
+# Samples folders
+- ^[Ss]amples/
+# LICENSE, README, git config files
+- ^COPYING$
+- LICENSE$
+- License$
+- gitattributes$
+- gitignore$
+- gitmodules$
+- ^README$
+- ^readme$
+# Test fixtures
+- ^[Tt]est/fixtures/
+# PhoneGap/Cordova
+- (^|/)cordova([^.]*)(\.min)?\.js$
+- (^|/)cordova\-\d\.\d(\.\d)?(\.min)?\.js$
+# Vagrant
+- ^Vagrantfile$
+# .DS_Store files
+- .[Dd][Ss]_[Ss]tore$