github-linguist 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/linguist/blob_helper.rb +10 -2
- data/lib/linguist/classifier.rb +183 -0
- data/lib/linguist/classifier.yml +19013 -0
- data/lib/linguist/language.rb +1 -1
- data/lib/linguist/sample.rb +74 -0
- data/lib/linguist/tokenizer.rb +157 -0
- metadata +22 -4
data/lib/linguist/sample.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'linguist/classifier'
|
2
|
+
require 'linguist/language'
|
3
|
+
|
4
|
+
module Linguist
  # Model for accessing classifier training data.
  class Sample
    # Samples live in test/ for now, we'll eventually move them out
    PATH = File.expand_path("../../../test/fixtures", __FILE__)

    # Public: Iterate over each Sample.
    #
    # &block - Yields Sample to block
    #
    # Returns nothing.
    def self.each(&block)
      Dir.entries(PATH).each do |category|
        # Ignore directory navigation entries
        next if %w[. ..].include?(category)

        # Skip text and binary for now
        # Possibly reconsider this later
        next if %w[text binary].include?(category)

        # Map directory name to a Language alias
        language = Linguist::Language.find_by_alias(category)
        raise "No language for #{category.inspect}" unless language

        dirname = File.join(PATH, category)
        Dir.entries(dirname).each do |filename|
          next if %w[. ..].include?(filename)

          yield new(File.join(dirname, filename), language)
        end
      end

      nil
    end

    # Public: Build Classifier from all samples.
    #
    # Returns trained Classifier.
    def self.classifier
      trained = Classifier.new
      each { |sample| trained.train(sample.language, sample.data) }
      trained.gc
    end

    # Internal: Initialize Sample.
    #
    # Samples should be initialized by Sample.each.
    #
    # path     - String full path to file.
    # language - Language of sample.
    def initialize(path, language)
      @path     = path
      @language = language
    end

    # Public: Get full path to file.
    #
    # Returns String.
    attr_reader :path

    # Public: Get sample language.
    #
    # Returns Language.
    attr_reader :language

    # Public: Read file contents.
    #
    # Returns String.
    def data
      File.read(path)
    end
  end
end
|
data/lib/linguist/tokenizer.rb
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Initialize a Tokenizer.
    #
    # data - String data to scan.
    def initialize(data)
      @data = data
    end

    # Public: Get source data.
    #
    # Returns String.
    attr_reader :data

    # Public: Extract tokens from data.
    #
    # Returns Array of token Strings.
    def tokens
      extract_tokens(data)
    end

    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        # Ruby single line comment
        if s.scan(/# /)
          tokens << "#"
          s.skip_until(/\n|\Z/)

        # C style single line comment
        elsif s.scan(/\/\/ /)
          tokens << "//"
          s.skip_until(/\n|\Z/)

        # Leading Tex or Matlab comments
        elsif s.scan(/\n%/)
          tokens << "%"
          s.skip_until(/\n|\Z/)

        # C multiline comments
        elsif s.scan(/\/\*/)
          tokens << "/*"
          s.skip_until(/\*\//)
          tokens << "*/"

        # Haskell multiline comments
        elsif s.scan(/\{-/)
          tokens << "{-"
          s.skip_until(/-\}/)
          tokens << "-}"

        # XML multiline comments
        elsif s.scan(/<!--/)
          tokens << "<!--"
          s.skip_until(/-->/)
          tokens << "-->"

        # Skip single or double quoted strings
        elsif s.scan(/"/)
          s.skip_until(/[^\\]"/)
        elsif s.scan(/'/)
          s.skip_until(/[^\\]'/)

        # Skip number literals. Hex literals such as 0xFF must be
        # matched explicitly: with the old /(0x)?\d+/ pattern the
        # scanner backtracked, consumed only the leading "0", and
        # leaked "xFF" out as a bogus word token.
        elsif s.scan(/0x[0-9a-fA-F]+|\d+/)

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)/)
          tokens << token

        # Regular token
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token

        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token

        else
          s.getch
        end
      end

      tokens
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href=", "class="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Emit start token
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token

          # Then skip over attribute value
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.skip_until(/[^\\]'/)
          else
            s.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token

        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate

        else
          s.getch
        end
      end

      tokens
    end
  end
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: github-linguist
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
|
-
-
|
7
|
+
- 2
|
8
8
|
- 0
|
9
9
|
- 0
|
10
|
-
version:
|
10
|
+
version: 2.0.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- GitHub
|
@@ -81,7 +81,7 @@ dependencies:
|
|
81
81
|
type: :runtime
|
82
82
|
version_requirements: *id004
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: json
|
85
85
|
prerelease: false
|
86
86
|
requirement: &id005 !ruby/object:Gem::Requirement
|
87
87
|
none: false
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
version: "0"
|
95
95
|
type: :development
|
96
96
|
version_requirements: *id005
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rake
|
99
|
+
prerelease: false
|
100
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
hash: 3
|
106
|
+
segments:
|
107
|
+
- 0
|
108
|
+
version: "0"
|
109
|
+
type: :development
|
110
|
+
version_requirements: *id006
|
97
111
|
description:
|
98
112
|
email:
|
99
113
|
executables:
|
@@ -104,6 +118,8 @@ extra_rdoc_files: []
|
|
104
118
|
|
105
119
|
files:
|
106
120
|
- lib/linguist/blob_helper.rb
|
121
|
+
- lib/linguist/classifier.rb
|
122
|
+
- lib/linguist/classifier.yml
|
107
123
|
- lib/linguist/file_blob.rb
|
108
124
|
- lib/linguist/language.rb
|
109
125
|
- lib/linguist/languages.yml
|
@@ -112,6 +128,8 @@ files:
|
|
112
128
|
- lib/linguist/pathname.rb
|
113
129
|
- lib/linguist/popular.yml
|
114
130
|
- lib/linguist/repository.rb
|
131
|
+
- lib/linguist/sample.rb
|
132
|
+
- lib/linguist/tokenizer.rb
|
115
133
|
- lib/linguist/vendor.yml
|
116
134
|
- lib/linguist.rb
|
117
135
|
- bin/linguist
|