geothird-linguist 2.6.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,98 @@
1
+ require 'yaml'
2
+
3
+ require 'linguist/md5'
4
+ require 'linguist/classifier'
5
+
6
+ module Linguist
7
+ # Model for accessing classifier training data.
8
+ module Samples
9
+ # Path to samples root directory
10
+ ROOT = File.expand_path("../../../samples", __FILE__)
11
+
12
+ # Path for serialized samples db
13
+ PATH = File.expand_path('../samples.json', __FILE__)
14
+
15
+ # Hash of serialized samples object
16
+ if File.exist?(PATH)
17
+ DATA = YAML.load_file(PATH)
18
+ end
19
+
20
+ # Public: Iterate over each sample.
21
+ #
22
+ # &block - Yields Sample to block
23
+ #
24
+ # Returns nothing.
25
+ def self.each(&block)
26
+ Dir.entries(ROOT).each do |category|
27
+ next if category == '.' || category == '..'
28
+
29
+ # Skip text and binary for now
30
+ # Possibly reconsider this later
31
+ next if category == 'Text' || category == 'Binary'
32
+
33
+ dirname = File.join(ROOT, category)
34
+ Dir.entries(dirname).each do |filename|
35
+ next if filename == '.' || filename == '..'
36
+
37
+ if filename == 'filenames'
38
+ Dir.entries(File.join(dirname, filename)).each do |subfilename|
39
+ next if subfilename == '.' || subfilename == '..'
40
+
41
+ yield({
42
+ :path => File.join(dirname, filename, subfilename),
43
+ :language => category,
44
+ :filename => subfilename
45
+ })
46
+ end
47
+ else
48
+ if File.extname(filename) == ""
49
+ raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
50
+ end
51
+
52
+ yield({
53
+ :path => File.join(dirname, filename),
54
+ :language => category,
55
+ :extname => File.extname(filename)
56
+ })
57
+ end
58
+ end
59
+ end
60
+
61
+ nil
62
+ end
63
+
64
+ # Public: Build Classifier from all samples.
65
+ #
66
+ # Returns Hash db of trained classifier data (includes 'extnames', 'filenames' and 'md5' keys).
67
+ def self.data
68
+ db = {}
69
+ db['extnames'] = {}
70
+ db['filenames'] = {}
71
+
72
+ each do |sample|
73
+ language_name = sample[:language]
74
+
75
+ if sample[:extname]
76
+ db['extnames'][language_name] ||= []
77
+ if !db['extnames'][language_name].include?(sample[:extname])
78
+ db['extnames'][language_name] << sample[:extname]
79
+ db['extnames'][language_name].sort!
80
+ end
81
+ end
82
+
83
+ if sample[:filename]
84
+ db['filenames'][language_name] ||= []
85
+ db['filenames'][language_name] << sample[:filename]
86
+ db['filenames'][language_name].sort!
87
+ end
88
+
89
+ data = File.read(sample[:path])
90
+ Classifier.train!(db, language_name, data)
91
+ end
92
+
93
+ db['md5'] = Linguist::MD5.hexdigest(db)
94
+
95
+ db
96
+ end
97
+ end
98
+ end
@@ -0,0 +1,197 @@
1
+ require 'strscan'
2
+
3
+ module Linguist
4
+ # Generic programming language tokenizer.
5
+ #
6
+ # Tokens are designed for use in the language bayes classifier.
7
+ # It strips any data strings or comments and preserves significant
8
+ # language symbols.
9
+ class Tokenizer
10
+ # Public: Extract tokens from data
11
+ #
12
+ # data - String to tokenize
13
+ #
14
+ # Returns Array of token Strings.
15
+ def self.tokenize(data)
16
+ new.extract_tokens(data)
17
+ end
18
+
19
+ # Read up to 100KB
20
+ BYTE_LIMIT = 100_000
21
+
22
+ # Start state on token, ignore anything till the next newline
23
+ SINGLE_LINE_COMMENTS = [
24
+ '//', # C
25
+ '#', # Ruby
26
+ '%', # Tex
27
+ ]
28
+
29
+ # Start state on opening token, ignore anything until the closing
30
+ # token is reached.
31
+ MULTI_LINE_COMMENTS = [
32
+ ['/*', '*/'], # C
33
+ ['<!--', '-->'], # XML
34
+ ['{-', '-}'], # Haskell
35
+ ['(*', '*)'] # Coq
36
+ ]
37
+
38
+ START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
39
+ "\s*#{Regexp.escape(c)} "
40
+ }.join("|"))
41
+
42
+ START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
43
+ Regexp.escape(c[0])
44
+ }.join("|"))
45
+
46
+ # Internal: Extract generic tokens from data.
47
+ #
48
+ # data - String to scan.
49
+ #
50
+ # Examples
51
+ #
52
+ # extract_tokens("printf('Hello')")
53
+ # # => ['printf', '(', ')']
54
+ #
55
+ # Returns Array of token Strings.
56
+ def extract_tokens(data)
57
+ s = StringScanner.new(data)
58
+
59
+ tokens = []
60
+ until s.eos?
61
+ break if s.pos >= BYTE_LIMIT
62
+
63
+ if token = s.scan(/^#!.+$/)
64
+ if name = extract_shebang(token)
65
+ tokens << "SHEBANG#!#{name}"
66
+ end
67
+
68
+ # Single line comment
69
+ elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
70
+ # tokens << token.strip
71
+ s.skip_until(/\n|\Z/)
72
+
73
+ # Multiline comments
74
+ elsif token = s.scan(START_MULTI_LINE_COMMENT)
75
+ # tokens << token
76
+ close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
77
+ s.skip_until(Regexp.compile(Regexp.escape(close_token)))
78
+ # tokens << close_token
79
+
80
+ # Skip single or double quoted strings
81
+ elsif s.scan(/"/)
82
+ if s.peek(1) == "\""
83
+ s.getch
84
+ else
85
+ s.skip_until(/[^\\]"/)
86
+ end
87
+ elsif s.scan(/'/)
88
+ if s.peek(1) == "'"
89
+ s.getch
90
+ else
91
+ s.skip_until(/[^\\]'/)
92
+ end
93
+
94
+ # Skip number literals
95
+ elsif s.scan(/(0x)?\d(\d|\.)*/)
96
+
97
+ # SGML style brackets
98
+ elsif token = s.scan(/<[^\s<>][^<>]*>/)
99
+ extract_sgml_tokens(token).each { |t| tokens << t }
100
+
101
+ # Common programming punctuation
102
+ elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
103
+ tokens << token
104
+
105
+ # Regular token
106
+ elsif token = s.scan(/[\w\.@#\/\*]+/)
107
+ tokens << token
108
+
109
+ # Common operators
110
+ elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
111
+ tokens << token
112
+
113
+ else
114
+ s.getch
115
+ end
116
+ end
117
+
118
+ tokens
119
+ end
120
+
121
+ # Internal: Extract normalized shebang command token.
122
+ #
123
+ # Examples
124
+ #
125
+ # extract_shebang("#!/usr/bin/ruby")
126
+ # # => "ruby"
127
+ #
128
+ # extract_shebang("#!/usr/bin/env node")
129
+ # # => "node"
130
+ #
131
+ # Returns String token or nil if it couldn't be parsed.
132
+ def extract_shebang(data)
133
+ s = StringScanner.new(data)
134
+
135
+ if path = s.scan(/^#!\s*\S+/)
136
+ script = path.split('/').last
137
+ if script == 'env'
138
+ s.scan(/\s+/)
139
+ script = s.scan(/\S+/)
140
+ end
141
+ script = script[/[^\d]+/, 0] if script
142
+ return script
143
+ end
144
+
145
+ nil
146
+ end
147
+
148
+ # Internal: Extract tokens from inside SGML tag.
149
+ #
150
+ # data - SGML tag String.
151
+ #
152
+ # Examples
153
+ #
154
+ # extract_sgml_tokens("<a href='' class=foo>")
155
+ # # => ["<a>", "href="]
156
+ #
157
+ # Returns Array of token Strings.
158
+ def extract_sgml_tokens(data)
159
+ s = StringScanner.new(data)
160
+
161
+ tokens = []
162
+
163
+ until s.eos?
164
+ # Emit start token
165
+ if token = s.scan(/<\/?[^\s>]+/)
166
+ tokens << "#{token}>"
167
+
168
+ # Emit attributes with trailing =
169
+ elsif token = s.scan(/\w+=/)
170
+ tokens << token
171
+
172
+ # Then skip over attribute value
173
+ if s.scan(/"/)
174
+ s.skip_until(/[^\\]"/)
175
+ elsif s.scan(/'/)
176
+ s.skip_until(/[^\\]'/)
177
+ else
178
+ s.skip_until(/\w+/)
179
+ end
180
+
181
+ # Emit lone attributes
182
+ elsif token = s.scan(/\w+/)
183
+ tokens << token
184
+
185
+ # Stop at the end of the tag
186
+ elsif s.scan(/>/)
187
+ s.terminate
188
+
189
+ else
190
+ s.getch
191
+ end
192
+ end
193
+
194
+ tokens
195
+ end
196
+ end
197
+ end
@@ -0,0 +1,106 @@
1
+ # Vendored files and directories are excluded from language
2
+ # statistics.
3
+ #
4
+ # Lines in this file are Regexps that are matched against the file
5
+ # pathname.
6
+ #
7
+ # Please add additional test coverage to
8
+ # `test/test_blob.rb#test_vendored` if you make any changes.
9
+
10
+ ## Vendor Conventions ##
11
+
12
+ # Caches
13
+ - cache/
14
+
15
+ # C deps
16
+ # https://github.com/joyent/node
17
+ - ^deps/
18
+ - ^tools/
19
+
20
+ # Node dependencies
21
+ - node_modules/
22
+
23
+ # Vendored dependencies
24
+ - vendor/
25
+
26
+ # Debian packaging
27
+ - ^debian/
28
+
29
+ ## Commonly Bundled JavaScript frameworks ##
30
+
31
+ # jQuery
32
+ - (^|/)jquery([^.]*)(\.min)?\.js$
33
+ - (^|/)jquery\-\d\.\d(\.\d)?(\.min)?\.js$
34
+
35
+ # Prototype
36
+ - (^|/)prototype(.*)\.js$
37
+ - (^|/)effects\.js$
38
+ - (^|/)controls\.js$
39
+ - (^|/)dragdrop\.js$
40
+
41
+ # MooTools
42
+ - (^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$
43
+
44
+ # Dojo
45
+ - (^|/)dojo\.js$
46
+
47
+ # MochiKit
48
+ - (^|/)MochiKit\.js$
49
+
50
+ # YUI
51
+ - (^|/)yahoo-([^.]*)\.js$
52
+ - (^|/)yui([^.]*)\.js$
53
+
54
+ # LESS css
55
+ - (^|/)less([^.]*)(\.min)?\.js$
56
+ - (^|/)less\-\d+\.\d+\.\d+(\.min)?\.js$
57
+
58
+ # WYS editors
59
+ - (^|/)ckeditor\.js$
60
+ - (^|/)tiny_mce([^.]*)\.js$
61
+ - (^|/)tiny_mce/(langs|plugins|themes|utils)
62
+
63
+ # MathJax
64
+ - (^|/)MathJax/
65
+
66
+ # SyntaxHighlighter - http://alexgorbatchev.com/
67
+ - (^|/)shBrush([^.]*)\.js$
68
+ - (^|/)shCore\.js$
69
+ - (^|/)shLegacy\.js$
70
+
71
+ ## Python ##
72
+
73
+ # django
74
+ - (^|/)admin_media/
75
+
76
+ # Fabric
77
+ - ^fabfile\.py$
78
+
79
+ # WAF
80
+ - ^waf$
81
+
82
+
83
+ ## Obj-C ##
84
+
85
+ # Sparkle
86
+ - (^|/)Sparkle/
87
+
88
+ ## .NET ##
89
+
90
+ # Visual Studio IntelliSense
91
+ - -vsdoc\.js$
92
+
93
+ # jQuery validation plugin (MS bundles this with asp.net mvc)
94
+ - (^|/)jquery([^.]*)\.validate(\.min)?\.js$
95
+
96
+ # Microsoft Ajax
97
+ - (^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$
98
+
99
+ # NuGet
100
+ - ^[Pp]ackages/
101
+
102
+ # ExtJS
103
+ - (^|/)extjs/
104
+
105
+ # Samples folders
106
+ - ^[Ss]amples/
metadata ADDED
@@ -0,0 +1,170 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: geothird-linguist
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.6.1
5
+ platform: ruby
6
+ authors:
7
+ - GitHub
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-04-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: charlock_holmes_bundle_icu
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 0.6.9.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 0.6.9.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: escape_utils
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.2.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 0.2.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: mime-types
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.19'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.19'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pygments.rb
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.4.2
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 0.4.2
69
+ - !ruby/object:Gem::Dependency
70
+ name: mocha
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: json
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rake
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: yajl-ruby
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description:
126
+ email:
127
+ executables:
128
+ - linguist
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - lib/linguist/blob_helper.rb
133
+ - lib/linguist/classifier.rb
134
+ - lib/linguist/file_blob.rb
135
+ - lib/linguist/generated.rb
136
+ - lib/linguist/language.rb
137
+ - lib/linguist/languages.yml
138
+ - lib/linguist/md5.rb
139
+ - lib/linguist/popular.yml
140
+ - lib/linguist/repository.rb
141
+ - lib/linguist/samples.json
142
+ - lib/linguist/samples.rb
143
+ - lib/linguist/tokenizer.rb
144
+ - lib/linguist/vendor.yml
145
+ - lib/linguist.rb
146
+ - bin/linguist
147
+ homepage: https://github.com/github/linguist
148
+ licenses: []
149
+ metadata: {}
150
+ post_install_message:
151
+ rdoc_options: []
152
+ require_paths:
153
+ - lib
154
+ required_ruby_version: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - '>='
157
+ - !ruby/object:Gem::Version
158
+ version: '0'
159
+ required_rubygems_version: !ruby/object:Gem::Requirement
160
+ requirements:
161
+ - - '>='
162
+ - !ruby/object:Gem::Version
163
+ version: '0'
164
+ requirements: []
165
+ rubyforge_project:
166
+ rubygems_version: 2.0.0
167
+ signing_key:
168
+ specification_version: 4
169
+ summary: GitHub Language detection
170
+ test_files: []