github-linguist 1.0.0 → 2.0.0

lib/linguist/language.rb CHANGED
@@ -26,7 +26,7 @@ module Linguist
       @overrides.include?(extension)
     end
 
-    # Include?: Return overridden extensions.
+    # Internal: Return overridden extensions.
     #
     # Returns extensions Array.
     def self.overridden_extensions
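
A hedged sketch of what the relabeled accessor exposes. Only the method name and its documented Array return come from the hunk above; the Linguist::Language receiver is inferred from the file, and the loop is purely illustrative:

# Hypothetical inspection of the extension override list.
Linguist::Language.overridden_extensions.each { |ext| puts ext }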
lib/linguist/sample.rb ADDED
@@ -0,0 +1,74 @@
+require 'linguist/classifier'
+require 'linguist/language'
+
+module Linguist
+  # Model for accessing classifier training data.
+  class Sample
+    # Samples live in test/ for now, we'll eventually move them out
+    PATH = File.expand_path("../../../test/fixtures", __FILE__)
+
+    # Public: Iterate over each Sample.
+    #
+    # &block - Yields Sample to block
+    #
+    # Returns nothing.
+    def self.each(&block)
+      Dir.entries(PATH).each do |category|
+        next if category == '.' || category == '..'
+
+        # Skip text and binary for now
+        # Possibly reconsider this later
+        next if category == 'text' || category == 'binary'
+
+        # Map directory name to a Language alias
+        language = Linguist::Language.find_by_alias(category)
+        raise "No language for #{category.inspect}" unless language
+
+        dirname = File.join(PATH, category)
+        Dir.entries(dirname).each do |filename|
+          next if filename == '.' || filename == '..'
+          yield new(File.join(dirname, filename), language)
+        end
+      end
+
+      nil
+    end
+
+    # Public: Build Classifier from all samples.
+    #
+    # Returns trained Classifier.
+    def self.classifier
+      classifier = Classifier.new
+      each { |sample| classifier.train(sample.language, sample.data) }
+      classifier.gc
+    end
+
+    # Internal: Initialize Sample.
+    #
+    # Samples should be initialized by Sample.each.
+    #
+    # path     - String full path to file.
+    # language - Language of sample.
+    def initialize(path, language)
+      @path = path
+      @language = language
+    end
+
+    # Public: Get full path to file.
+    #
+    # Returns String.
+    attr_reader :path
+
+    # Public: Get sample language.
+    #
+    # Returns Language.
+    attr_reader :language
+
+    # Public: Read file contents.
+    #
+    # Returns String.
+    def data
+      File.read(path)
+    end
+  end
+end
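
A minimal usage sketch for the new Sample model. Sample.each, #path, #language, and Sample.classifier are all defined in the hunk above; Language#name is assumed from the gem's existing Language class:

require 'linguist/sample'

# Enumerate every training fixture with its mapped language.
Linguist::Sample.each do |sample|
  puts "#{sample.language.name}: #{sample.path}"
end

# Train one classifier on all samples; per the doc comment above,
# this returns the trained (and gc'd) Classifier.
classifier = Linguist::Sample.classifier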
lib/linguist/tokenizer.rb ADDED
@@ -0,0 +1,157 @@
+module Linguist
+  # Generic programming language tokenizer.
+  #
+  # Tokens are designed for use in the language bayes classifier.
+  # It strips any data strings or comments and preserves significant
+  # language symbols.
+  class Tokenizer
+    # Public: Initialize a Tokenizer.
+    #
+    # data - String data to scan.
+    def initialize(data)
+      @data = data
+    end
+
+    # Public: Get source data.
+    #
+    # Returns String.
+    attr_reader :data
+
+    # Public: Extract tokens from data.
+    #
+    # Returns Array of token Strings.
+    def tokens
+      extract_tokens(data)
+    end
+
+    # Internal: Extract generic tokens from data.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
+    def extract_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+      until s.eos?
+        # Ruby single line comment
+        if token = s.scan(/# /)
+          tokens << "#"
+          s.skip_until(/\n|\Z/)
+
+        # C style single line comment
+        elsif token = s.scan(/\/\/ /)
+          tokens << "//"
+          s.skip_until(/\n|\Z/)
+
+        # Leading Tex or Matlab comments
+        elsif token = s.scan(/\n%/)
+          tokens << "%"
+          s.skip_until(/\n|\Z/)
+
+        # C multiline comments
+        elsif token = s.scan(/\/\*/)
+          tokens << "/*"
+          s.skip_until(/\*\//)
+          tokens << "*/"
+
+        # Haskell multiline comments
+        elsif token = s.scan(/\{-/)
+          tokens << "{-"
+          s.skip_until(/-\}/)
+          tokens << "-}"
+
+        # XML multiline comments
+        elsif token = s.scan(/<!--/)
+          tokens << "<!--"
+          s.skip_until(/-->/)
+          tokens << "-->"
+
+        # Skip single or double quoted strings
+        elsif s.scan(/"/)
+          s.skip_until(/[^\\]"/)
+        elsif s.scan(/'/)
+          s.skip_until(/[^\\]'/)
+
+        # Skip number literals
+        elsif s.scan(/(0x)?\d+/)
+
+        # SGML style brackets
+        elsif token = s.scan(/<[^\s<>][^<>]*>/)
+          extract_sgml_tokens(token).each { |t| tokens << t }
+
+        # Common programming punctuation
+        elsif token = s.scan(/;|\{|\}|\(|\)/)
+          tokens << token
+
+        # Regular token
+        elsif token = s.scan(/[\w\.@#\/\*]+/)
+          tokens << token
+
+        # Common operators
+        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
+          tokens << token
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
+
+    # Internal: Extract tokens from inside SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href='' class=foo>")
+    #   # => ["<a>", "href="]
+    #
+    # Returns Array of token Strings.
+    def extract_sgml_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+
+      until s.eos?
+        # Emit start token
+        if token = s.scan(/<\/?[^\s>]+/)
+          tokens << "#{token}>"
+
+        # Emit attributes with trailing =
+        elsif token = s.scan(/\w+=/)
+          tokens << token
+
+          # Then skip over attribute value
+          if s.scan(/"/)
+            s.skip_until(/[^\\]"/)
+          elsif s.scan(/'/)
+            s.skip_until(/[^\\]'/)
+          else
+            s.skip_until(/\w+/)
+          end
+
+        # Emit lone attributes
+        elsif token = s.scan(/\w+/)
+          tokens << token
+
+        # Stop at the end of the tag
+        elsif s.scan(/>/)
+          s.terminate
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
+  end
+end
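
And a quick sketch of the tokenizer in use. Tokenizer#tokens comes from the hunk above; note the new file uses StringScanner without requiring strscan itself, so this sketch loads it explicitly alongside the class:

require 'strscan'
require 'linguist/tokenizer'

# String contents and comment bodies are dropped; structural
# symbols and identifiers survive.
Linguist::Tokenizer.new("printf('Hello') // greet").tokens
# => ["printf", "(", ")", "//"]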
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: github-linguist
 version: !ruby/object:Gem::Version
-  hash: 23
+  hash: 15
   prerelease:
   segments:
-  - 1
+  - 2
   - 0
   - 0
-  version: 1.0.0
+  version: 2.0.0
 platform: ruby
 authors:
 - GitHub
@@ -81,7 +81,7 @@ dependencies:
   type: :runtime
   version_requirements: *id004
 - !ruby/object:Gem::Dependency
-  name: rake
+  name: json
   prerelease: false
   requirement: &id005 !ruby/object:Gem::Requirement
     none: false
@@ -94,6 +94,20 @@ dependencies:
         version: "0"
   type: :development
   version_requirements: *id005
+- !ruby/object:Gem::Dependency
+  name: rake
+  prerelease: false
+  requirement: &id006 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  type: :development
+  version_requirements: *id006
 description:
 email:
 executables:
@@ -104,6 +118,8 @@ extra_rdoc_files: []
 
 files:
 - lib/linguist/blob_helper.rb
+- lib/linguist/classifier.rb
+- lib/linguist/classifier.yml
 - lib/linguist/file_blob.rb
 - lib/linguist/language.rb
 - lib/linguist/languages.yml
@@ -112,6 +128,8 @@ files:
 - lib/linguist/pathname.rb
 - lib/linguist/popular.yml
 - lib/linguist/repository.rb
+- lib/linguist/sample.rb
+- lib/linguist/tokenizer.rb
 - lib/linguist/vendor.yml
 - lib/linguist.rb
 - bin/linguist
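
For consumers, the version bump above reduces to a one-line Gemfile change (an illustrative pin, not part of the package itself):

gem 'github-linguist', '2.0.0'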