gitlab-linguist 2.9.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,98 @@
1
+ require 'yaml'
2
+
3
+ require 'linguist/md5'
4
+ require 'linguist/classifier'
5
+
6
module Linguist
  # Model for accessing classifier training data.
  module Samples
    # Path to samples root directory
    ROOT = File.expand_path("../../../samples", __FILE__)

    # Path for serialized samples db
    PATH = File.expand_path('../samples.json', __FILE__)

    # Hash of serialized samples object.
    #
    # The db is stored as JSON but is read through the YAML loader.
    # DATA is only defined when the serialized db exists on disk.
    if File.exist?(PATH)
      DATA = YAML.load_file(PATH)
    end

    # Public: Iterate over each sample.
    #
    # Directory listings are sorted so samples are always yielded in the
    # same order regardless of the underlying filesystem. Entries in the
    # samples root that are not directories (e.g. a stray .DS_Store) are
    # skipped, since only directories name a language.
    #
    # &block - Yields a Hash describing each sample:
    #          :path     - String full path to the sample file.
    #          :language - String language (category directory) name.
    #          :extname  - String file extension (regular samples only).
    #          :filename - String basename (samples under filenames/ only).
    #
    # Returns nothing.
    # Raises RuntimeError if a sample outside of a filenames/ subdirectory
    # has no file extension.
    def self.each(&block)
      Dir.entries(ROOT).sort.each do |category|
        next if category == '.' || category == '..'

        # Skip text and binary for now
        # Possibly reconsider this later
        next if category == 'Text' || category == 'Binary'

        dirname = File.join(ROOT, category)

        # Only directories are language categories; ignore stray files.
        next unless File.directory?(dirname)

        Dir.entries(dirname).sort.each do |filename|
          next if filename == '.' || filename == '..'

          if filename == 'filenames'
            # Samples identified by their whole filename rather than by
            # extension live in the filenames/ subdirectory.
            Dir.entries(File.join(dirname, filename)).sort.each do |subfilename|
              next if subfilename == '.' || subfilename == '..'

              yield({
                :path     => File.join(dirname, filename, subfilename),
                :language => category,
                :filename => subfilename
              })
            end
          else
            if File.extname(filename) == ""
              raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
            end

            yield({
              :path     => File.join(dirname, filename),
              :language => category,
              :extname  => File.extname(filename)
            })
          end
        end
      end

      nil
    end

    # Public: Build the samples db from all samples.
    #
    # Every sample is read from disk and passed to Classifier.train!
    # together with the db. Extensions and filenames seen for each
    # language are recorded (kept sorted), and a digest of the finished
    # db is stored under 'md5'.
    #
    # Returns the db Hash with 'extnames', 'filenames' and 'md5' keys, plus
    # whatever Classifier.train! stores in it.
    def self.data
      db = {}
      db['extnames'] = {}
      db['filenames'] = {}

      each do |sample|
        language_name = sample[:language]

        if sample[:extname]
          db['extnames'][language_name] ||= []
          if !db['extnames'][language_name].include?(sample[:extname])
            db['extnames'][language_name] << sample[:extname]
            db['extnames'][language_name].sort!
          end
        end

        if sample[:filename]
          db['filenames'][language_name] ||= []
          db['filenames'][language_name] << sample[:filename]
          db['filenames'][language_name].sort!
        end

        contents = File.read(sample[:path])
        Classifier.train!(db, language_name, contents)
      end

      db['md5'] = Linguist::MD5.hexdigest(db)

      db
    end
  end
end
@@ -0,0 +1,198 @@
1
+ require 'strscan'
2
+
3
module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Extract tokens from data
    #
    # data - String to tokenize
    #
    # Returns Array of token Strings.
    def self.tokenize(data)
      new.extract_tokens(data)
    end

    # Read up to 100KB
    BYTE_LIMIT = 100_000

    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '#',  # Ruby
      '%',  # Tex
    ]

    # Start state on opening token, ignore anything until the closing
    # token is reached.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
      ['{-', '-}'],    # Haskell
      ['(*', '*)'],    # Coq
      ['"""', '"""']   # Python
    ]

    # Matches optional indentation, a comment marker and one space.
    #
    # The indentation is written as an explicit character class: inside a
    # double-quoted Ruby string "\s" is a literal space, not the regexp
    # whitespace class, which made tab-indented comments go unrecognized.
    # Only horizontal whitespace is allowed so the pattern never spans
    # lines.
    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
      "[ \\t]*#{Regexp.escape(c)} "
    }.join("|"))

    # Matches any of the multi line comment opening tokens.
    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
      Regexp.escape(c[0])
    }.join("|"))

    # Internal: Extract generic tokens from data.
    #
    # Scanning stops once the scanner position reaches BYTE_LIMIT.
    # Comments, quoted strings and number literals are skipped; shebangs,
    # SGML tags, punctuation, words and common operators become tokens.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        break if s.pos >= BYTE_LIMIT

        if token = s.scan(/^#!.+$/)
          if name = extract_shebang(token)
            tokens << "SHEBANG#!#{name}"
          end

        # Single line comment
        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
          # tokens << token.strip
          s.skip_until(/\n|\Z/)

        # Multiline comments
        elsif token = s.scan(START_MULTI_LINE_COMMENT)
          # tokens << token
          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
          # When the closing token is missing skip_until does not move the
          # scanner, so the remainder is tokenized as regular code.
          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
          # tokens << close_token

        # Skip single or double quoted strings
        elsif s.scan(/"/)
          if s.peek(1) == "\""
            # Empty string: just consume the closing quote
            s.getch
          else
            s.skip_until(/[^\\]"/)
          end
        elsif s.scan(/'/)
          if s.peek(1) == "'"
            s.getch
          else
            s.skip_until(/[^\\]'/)
          end

        # Skip number literals
        elsif s.scan(/(0x)?\d(\d|\.)*/)

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
          tokens << token

        # Regular token
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token

        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token

        else
          # Anything else (whitespace, unknown symbols) is dropped
          s.getch
        end
      end

      tokens
    end

    # Internal: Extract normalized shebang command token.
    #
    # The interpreter name is the last path segment, or the first argument
    # when the interpreter is `env`. Only the leading run of non-digit
    # characters is kept, so version suffixes are dropped.
    #
    # data - String shebang line.
    #
    # Examples
    #
    #   extract_shebang("#!/usr/bin/ruby")
    #   # => "ruby"
    #
    #   extract_shebang("#!/usr/bin/env node")
    #   # => "node"
    #
    #   extract_shebang("#!/usr/bin/python2.7")
    #   # => "python"
    #
    # Returns String token or nil if it couldn't be parsed.
    def extract_shebang(data)
      s = StringScanner.new(data)

      if path = s.scan(/^#!\s*\S+/)
        script = path.split('/').last
        if script == 'env'
          s.scan(/\s+/)
          script = s.scan(/\S+/)
        end
        script = script[/[^\d]+/, 0] if script
        return script
      end

      nil
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # Emits the tag name as "<name>", every attribute name that has a
    # value with its trailing "=", and lone attributes as-is. Attribute
    # values are skipped.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href=", "class="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Emit start token
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token

          # Then skip over attribute value. An empty quoted value must be
          # consumed explicitly: the closing-quote pattern needs a
          # preceding character and would otherwise run on to the closing
          # quote of a later attribute, swallowing it.
          if s.scan(/"/)
            if s.peek(1) == "\""
              s.getch
            else
              s.skip_until(/[^\\]"/)
            end
          elsif s.scan(/'/)
            if s.peek(1) == "'"
              s.getch
            else
              s.skip_until(/[^\\]'/)
            end
          else
            s.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token

        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate

        else
          s.getch
        end
      end

      tokens
    end
  end
end
@@ -0,0 +1,129 @@
1
# Vendored files and directories are excluded from language
# statistics.
#
# Lines in this file are Regexps that are matched against the file
# pathname. Remember to escape literal dots (`\.`); a bare `.` matches
# any character.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_vendored` if you make any changes.

## Vendor Conventions ##

# Caches
- cache/

# C deps
#  https://github.com/joyent/node
- ^deps/
- ^tools/
- (^|/)configure$
- (^|/)configure\.ac$
- (^|/)config\.guess$
- (^|/)config\.sub$

# Node dependencies
- node_modules/

# Erlang bundles
- ^rebar$

# Vendored dependencies
- vendor/

# Debian packaging
- ^debian/

## Commonly Bundled JavaScript frameworks ##

# jQuery
- (^|/)jquery([^.]*)(\.min)?\.js$
- (^|/)jquery\-\d\.\d+(\.\d+)?(\.min)?\.js$

# jQuery UI
- (^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?(\.min)?\.(js|css)$
- (^|/)jquery\.(ui|effects)\.([^.]*)(\.min)?\.(js|css)$

# Prototype
- (^|/)prototype(.*)\.js$
- (^|/)effects\.js$
- (^|/)controls\.js$
- (^|/)dragdrop\.js$

# MooTools
- (^|/)mootools([^.]*)\d+\.\d+\.\d+([^.]*)\.js$

# Dojo
- (^|/)dojo\.js$

# MochiKit
- (^|/)MochiKit\.js$

# YUI
- (^|/)yahoo-([^.]*)\.js$
- (^|/)yui([^.]*)\.js$

# WYSIWYG editors
- (^|/)ckeditor\.js$
- (^|/)tiny_mce([^.]*)\.js$
- (^|/)tiny_mce/(langs|plugins|themes|utils)

# MathJax
- (^|/)MathJax/

# SyntaxHighlighter - http://alexgorbatchev.com/
- (^|/)shBrush([^.]*)\.js$
- (^|/)shCore\.js$
- (^|/)shLegacy\.js$

## Python ##

# django
- (^|/)admin_media/

# Fabric
- ^fabfile\.py$

# WAF
- ^waf$


## Obj-C ##

# Sparkle
- (^|/)Sparkle/

## .NET ##

# Visual Studio IntelliSense
- -vsdoc\.js$

# jQuery validation plugin (MS bundles this with asp.net mvc)
- (^|/)jquery([^.]*)\.validate(\.unobtrusive)?(\.min)?\.js$
- (^|/)jquery([^.]*)\.unobtrusive\-ajax(\.min)?\.js$

# Microsoft Ajax
- (^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$

# NuGet
- ^[Pp]ackages/

# ExtJS
- (^|/)extjs/

# Samples folders
- ^[Ss]amples/

# LICENSE, README, git config files
- ^COPYING$
- ^LICENSE$
- gitattributes$
- gitignore$
- gitmodules$
- ^README$
- ^readme$

# Test fixtures
- ^[Tt]est/fixtures/

# .DS_Store files
- \.[Dd][Ss]_[Ss]tore$
metadata ADDED
@@ -0,0 +1,171 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gitlab-linguist
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.9.5
5
+ platform: ruby
6
+ authors:
7
+ - GitHub
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-11-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: charlock_holmes
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 0.6.6
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 0.6.6
27
+ - !ruby/object:Gem::Dependency
28
+ name: escape_utils
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.3.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 0.3.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: mime-types
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.19'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.19'
55
+ - !ruby/object:Gem::Dependency
56
+ name: gitlab-pygments.rb
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.5.4
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 0.5.4
69
+ - !ruby/object:Gem::Dependency
70
+ name: mocha
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: json
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rake
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: yajl-ruby
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description:
126
+ email:
127
+ executables:
128
+ - linguist
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - lib/linguist/language.rb
133
+ - lib/linguist/vendor.yml
134
+ - lib/linguist/repository.rb
135
+ - lib/linguist/samples.rb
136
+ - lib/linguist/samples.json
137
+ - lib/linguist/generated.rb
138
+ - lib/linguist/md5.rb
139
+ - lib/linguist/popular.yml
140
+ - lib/linguist/classifier.rb
141
+ - lib/linguist/tokenizer.rb
142
+ - lib/linguist/blob_helper.rb
143
+ - lib/linguist/languages.yml
144
+ - lib/linguist/file_blob.rb
145
+ - lib/linguist.rb
146
+ - bin/linguist
147
+ homepage: https://github.com/github/linguist
148
+ licenses:
149
+ - MIT
150
+ metadata: {}
151
+ post_install_message:
152
+ rdoc_options: []
153
+ require_paths:
154
+ - lib
155
+ required_ruby_version: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - '>='
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ required_rubygems_version: !ruby/object:Gem::Requirement
161
+ requirements:
162
+ - - '>='
163
+ - !ruby/object:Gem::Version
164
+ version: '0'
165
+ requirements: []
166
+ rubyforge_project:
167
+ rubygems_version: 2.0.3
168
+ signing_key:
169
+ specification_version: 4
170
+ summary: GitHub Language detection
171
+ test_files: []