github-linguist 1.0.0 → 2.0.0

@@ -26,7 +26,7 @@ module Linguist
        @overrides.include?(extension)
      end

-     # Include?: Return overridden extensions.
+     # Internal: Return overridden extensions.
      #
      # Returns extensions Array.
      def self.overridden_extensions
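
The change above is a documentation fix: linguist's method comments follow the TomDoc convention, where the first word of a comment marks visibility ("Public:", "Internal:", "Deprecated:"), so the stray "Include?:" prefix becomes the standard "Internal:" status. A minimal sketch of the convention (illustrative methods, not from this diff):

    # Public: Part of the gem's supported API.
    #
    # Returns a value callers may rely on.
    def supported; end

    # Internal: Implementation detail; may change without notice.
    def helper; end
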
lib/linguist/sample.rb ADDED
@@ -0,0 +1,74 @@
+ require 'linguist/classifier'
+ require 'linguist/language'
+
+ module Linguist
+   # Model for accessing classifier training data.
+   class Sample
+     # Samples live in test/ for now, we'll eventually move them out
+     PATH = File.expand_path("../../../test/fixtures", __FILE__)
+
+     # Public: Iterate over each Sample.
+     #
+     # &block - Yields Sample to block
+     #
+     # Returns nothing.
+     def self.each(&block)
+       Dir.entries(PATH).each do |category|
+         next if category == '.' || category == '..'
+
+         # Skip text and binary for now
+         # Possibly reconsider this later
+         next if category == 'text' || category == 'binary'
+
+         # Map directory name to a Language alias
+         language = Linguist::Language.find_by_alias(category)
+         raise "No language for #{category.inspect}" unless language
+
+         dirname = File.join(PATH, category)
+         Dir.entries(dirname).each do |filename|
+           next if filename == '.' || filename == '..'
+           yield new(File.join(dirname, filename), language)
+         end
+       end
+
+       nil
+     end
+
+     # Public: Build Classifier from all samples.
+     #
+     # Returns trained Classifier.
+     def self.classifier
+       classifier = Classifier.new
+       each { |sample| classifier.train(sample.language, sample.data) }
+       classifier.gc
+     end
+
+     # Internal: Initialize Sample.
+     #
+     # Samples should be initialized by Sample.each.
+     #
+     # path - String full path to file.
+     # language - Language of sample.
+     def initialize(path, language)
+       @path = path
+       @language = language
+     end
+
+     # Public: Get full path to file.
+     #
+     # Returns String.
+     attr_reader :path
+
+     # Public: Get sample language.
+     #
+     # Returns Language.
+     attr_reader :language
+
+     # Public: Read file contents.
+     #
+     # Returns String.
+     def data
+       File.read(path)
+     end
+   end
+ end
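
Sample.each walks the fixture tree at PATH, one directory per language alias, and yields a Sample per file; Sample.classifier folds every sample into a single trained Classifier and compacts it with gc. A hypothetical usage sketch (the fixture layout in the comment is assumed, not shown in this diff):

    require 'linguist/sample'

    # e.g. a file under test/fixtures/ruby/ is yielded as a Sample
    # whose language resolves via Language.find_by_alias('ruby')
    Linguist::Sample.each do |sample|
      puts "#{sample.language.name}: #{sample.path}"
    end

    # One call trains a classifier on every sample
    classifier = Linguist::Sample.classifier
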
lib/linguist/tokenizer.rb ADDED
@@ -0,0 +1,159 @@
+ require 'strscan'
+
+ module Linguist
+   # Generic programming language tokenizer.
+   #
+   # Tokens are designed for use in the language bayes classifier.
+   # It strips any data strings or comments and preserves significant
+   # language symbols.
+   class Tokenizer
+     # Public: Initialize a Tokenizer.
+     #
+     # data - String data to scan.
+     def initialize(data)
+       @data = data
+     end
+
+     # Public: Get source data.
+     #
+     # Returns String.
+     attr_reader :data
+
+     # Public: Extract tokens from data.
+     #
+     # Returns Array of token Strings.
+     def tokens
+       extract_tokens(data)
+     end
+
+     # Internal: Extract generic tokens from data.
+     #
+     # data - String to scan.
+     #
+     # Examples
+     #
+     #   extract_tokens("printf('Hello')")
+     #   # => ['printf', '(', ')']
+     #
+     # Returns Array of token Strings.
+     def extract_tokens(data)
+       s = StringScanner.new(data)
+
+       tokens = []
+       until s.eos?
+         # Ruby single line comment
+         if token = s.scan(/# /)
+           tokens << "#"
+           s.skip_until(/\n|\Z/)
+
+         # C style single line comment
+         elsif token = s.scan(/\/\/ /)
+           tokens << "//"
+           s.skip_until(/\n|\Z/)
+
+         # Leading Tex or Matlab comments
+         elsif token = s.scan(/\n%/)
+           tokens << "%"
+           s.skip_until(/\n|\Z/)
+
+         # C multiline comments
+         elsif token = s.scan(/\/\*/)
+           tokens << "/*"
+           s.skip_until(/\*\//)
+           tokens << "*/"
+
+         # Haskell multiline comments
+         elsif token = s.scan(/\{-/)
+           tokens << "{-"
+           s.skip_until(/-\}/)
+           tokens << "-}"
+
+         # XML multiline comments
+         elsif token = s.scan(/<!--/)
+           tokens << "<!--"
+           s.skip_until(/-->/)
+           tokens << "-->"
+
+         # Skip single or double quoted strings
+         elsif s.scan(/"/)
+           s.skip_until(/[^\\]"/)
+         elsif s.scan(/'/)
+           s.skip_until(/[^\\]'/)
+
+         # Skip number literals
+         elsif s.scan(/(0x)?\d+/)
+
+         # SGML style brackets
+         elsif token = s.scan(/<[^\s<>][^<>]*>/)
+           extract_sgml_tokens(token).each { |t| tokens << t }
+
+         # Common programming punctuation
+         elsif token = s.scan(/;|\{|\}|\(|\)/)
+           tokens << token
+
+         # Regular token
+         elsif token = s.scan(/[\w\.@#\/\*]+/)
+           tokens << token
+
+         # Common operators
+         elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
+           tokens << token
+
+         else
+           s.getch
+         end
+       end
+
+       tokens
+     end
+
+     # Internal: Extract tokens from inside SGML tag.
+     #
+     # data - SGML tag String.
+     #
+     # Examples
+     #
+     #   extract_sgml_tokens("<a href='' class=foo>")
+     #   # => ["<a>", "href="]
+     #
+     # Returns Array of token Strings.
+     def extract_sgml_tokens(data)
+       s = StringScanner.new(data)
+
+       tokens = []
+
+       until s.eos?
+         # Emit start token
+         if token = s.scan(/<\/?[^\s>]+/)
+           tokens << "#{token}>"
+
+         # Emit attributes with trailing =
+         elsif token = s.scan(/\w+=/)
+           tokens << token
+
+           # Then skip over attribute value
+           if s.scan(/"/)
+             s.skip_until(/[^\\]"/)
+           elsif s.scan(/'/)
+             s.skip_until(/[^\\]'/)
+           else
+             s.skip_until(/\w+/)
+           end
+
+         # Emit lone attributes
+         elsif token = s.scan(/\w+/)
+           tokens << token
+
+         # Stop at the end of the tag
+         elsif s.scan(/>/)
+           s.terminate
+
+         else
+           s.getch
+         end
+       end
+
+       tokens
+     end
+   end
+ end
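
A quick sketch of the two tokenizing paths above (inputs illustrative; the expected arrays follow from tracing extract_tokens and extract_sgml_tokens):

    require 'linguist/tokenizer'

    # Comments collapse to their marker; identifiers and punctuation survive
    Linguist::Tokenizer.new("def foo; end # note").tokens
    # => ["def", "foo", ";", "end", "#"]

    # SGML tags split into a start token plus attribute-name tokens,
    # with attribute values skipped
    Linguist::Tokenizer.new(%(<a href="http://example.com" class=foo>)).tokens
    # => ["<a>", "href=", "class="]
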
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: github-linguist
  version: !ruby/object:Gem::Version
-   hash: 23
+   hash: 15
    prerelease:
    segments:
-   - 1
+   - 2
    - 0
    - 0
-   version: 1.0.0
+   version: 2.0.0
  platform: ruby
  authors:
  - GitHub
@@ -81,7 +81,7 @@ dependencies:
    type: :runtime
    version_requirements: *id004
  - !ruby/object:Gem::Dependency
-   name: rake
+   name: json
    prerelease: false
    requirement: &id005 !ruby/object:Gem::Requirement
      none: false
@@ -94,6 +94,20 @@ dependencies:
          version: "0"
    type: :development
    version_requirements: *id005
+ - !ruby/object:Gem::Dependency
+   name: rake
+   prerelease: false
+   requirement: &id006 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 3
+         segments:
+         - 0
+         version: "0"
+   type: :development
+   version_requirements: *id006
  description:
  email:
  executables:
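
Deserialized, the dependency change is: the &id005 development dependency is now json, and rake re-enters as the new &id006 stanza. In gemspec DSL terms this corresponds roughly to (a sketch; the gemspec source itself is not part of this diff):

    Gem::Specification.new do |s|
      s.name    = 'github-linguist'
      s.version = '2.0.0'

      s.add_development_dependency 'json', '>= 0'
      s.add_development_dependency 'rake', '>= 0'
    end
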
@@ -104,6 +118,8 @@ extra_rdoc_files: []
 
  files:
  - lib/linguist/blob_helper.rb
+ - lib/linguist/classifier.rb
+ - lib/linguist/classifier.yml
  - lib/linguist/file_blob.rb
  - lib/linguist/language.rb
  - lib/linguist/languages.yml
@@ -112,6 +128,8 @@ files:
  - lib/linguist/pathname.rb
  - lib/linguist/popular.yml
  - lib/linguist/repository.rb
+ - lib/linguist/sample.rb
+ - lib/linguist/tokenizer.rb
  - lib/linguist/vendor.yml
  - lib/linguist.rb
  - bin/linguist