geothird-linguist 2.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,98 @@
1
+ require 'yaml'
2
+
3
+ require 'linguist/md5'
4
+ require 'linguist/classifier'
5
+
6
module Linguist
  # Model for accessing classifier training data.
  module Samples
    # Path to samples root directory
    ROOT = File.expand_path("../../../samples", __FILE__)

    # Path for serialized samples db
    PATH = File.expand_path('../samples.json', __FILE__)

    # Hash of serialized samples object. Only defined when the
    # serialized db file exists. (samples.json is read with YAML,
    # which accepts JSON documents.)
    if File.exist?(PATH)
      DATA = YAML.load_file(PATH)
    end

    # Public: Iterate over each sample.
    #
    # &block - Yields a sample Hash (:path, :language, plus either
    #          :filename or :extname) to the block.
    #
    # Returns nothing.
    def self.each(&block)
      Dir.entries(ROOT).each do |category|
        next if %w[. ..].include?(category)

        # Skip text and binary for now
        # Possibly reconsider this later
        next if %w[Text Binary].include?(category)

        dirname = File.join(ROOT, category)
        Dir.entries(dirname).each do |filename|
          next if %w[. ..].include?(filename)

          if filename == 'filenames'
            # Samples keyed by their full filename live in a
            # `filenames/` subdirectory of the language directory.
            subdir = File.join(dirname, filename)
            Dir.entries(subdir).each do |subfilename|
              next if %w[. ..].include?(subfilename)

              yield({
                :path     => File.join(subdir, subfilename),
                :language => category,
                :filename => subfilename
              })
            end
          else
            # Every other sample must carry a file extension.
            if File.extname(filename) == ""
              raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
            end

            yield({
              :path     => File.join(dirname, filename),
              :language => category,
              :extname  => File.extname(filename)
            })
          end
        end
      end

      nil
    end

    # Public: Build Classifier from all samples.
    #
    # Returns trained Classifier db Hash with 'extnames', 'filenames'
    # and an 'md5' digest of the assembled data.
    def self.data
      db = {}
      db['extnames']  = {}
      db['filenames'] = {}

      each do |sample|
        language_name = sample[:language]

        if (extname = sample[:extname])
          extnames = (db['extnames'][language_name] ||= [])
          # Record each extension once, keeping the list sorted.
          unless extnames.include?(extname)
            extnames << extname
            extnames.sort!
          end
        end

        if (filename = sample[:filename])
          filenames = (db['filenames'][language_name] ||= [])
          filenames << filename
          filenames.sort!
        end

        Classifier.train!(db, language_name, File.read(sample[:path]))
      end

      # Digest of the finished db contents.
      db['md5'] = Linguist::MD5.hexdigest(db)

      db
    end
  end
end
@@ -0,0 +1,197 @@
1
+ require 'strscan'
2
+
3
module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Extract tokens from data
    #
    # data - String to tokenize
    #
    # Returns Array of token Strings.
    def self.tokenize(data)
      new.extract_tokens(data)
    end

    # Read up to 100KB
    BYTE_LIMIT = 100_000

    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '#', # Ruby
      '%', # Tex
    ]

    # Start state on opening token, ignore anything until the closing
    # token is reached.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
      ['{-', '-}'],    # Haskell
      ['(*', '*)']     # Coq
    ]

    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
      "\s*#{Regexp.escape(c)} "
    }.join("|"))

    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
      Regexp.escape(c[0])
    }.join("|"))

    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      scanner = StringScanner.new(data)
      tokens = []

      until scanner.eos?
        # Hard stop once we have scanned past the byte limit.
        break if scanner.pos >= BYTE_LIMIT

        # Shebang line: emit a normalized interpreter token.
        if (token = scanner.scan(/^#!.+$/))
          name = extract_shebang(token)
          tokens << "SHEBANG#!#{name}" if name

        # Single line comment: drop everything up to the newline.
        elsif scanner.beginning_of_line? && (token = scanner.scan(START_SINGLE_LINE_COMMENT))
          scanner.skip_until(/\n|\Z/)

        # Multiline comment: drop everything up to the matching
        # closing token.
        elsif (token = scanner.scan(START_MULTI_LINE_COMMENT))
          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
          scanner.skip_until(Regexp.compile(Regexp.escape(close_token)))

        # Skip single or double quoted strings.
        elsif scanner.scan(/"/)
          if scanner.peek(1) == "\""
            # Empty string literal.
            scanner.getch
          else
            scanner.skip_until(/[^\\]"/)
          end
        elsif scanner.scan(/'/)
          if scanner.peek(1) == "'"
            # Empty string literal.
            scanner.getch
          else
            scanner.skip_until(/[^\\]'/)
          end

        # Skip number literals.
        elsif scanner.scan(/(0x)?\d(\d|\.)*/)

        # SGML style brackets: tokenize the tag contents separately.
        elsif (token = scanner.scan(/<[^\s<>][^<>]*>/))
          tokens.concat(extract_sgml_tokens(token))

        # Common programming punctuation.
        elsif (token = scanner.scan(/;|\{|\}|\(|\)|\[|\]/))
          tokens << token

        # Regular token.
        elsif (token = scanner.scan(/[\w\.@#\/\*]+/))
          tokens << token

        # Common operators.
        elsif (token = scanner.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/))
          tokens << token

        else
          scanner.getch
        end
      end

      tokens
    end

    # Internal: Extract normalized shebang command token.
    #
    # Examples
    #
    #   extract_shebang("#!/usr/bin/ruby")
    #   # => "ruby"
    #
    #   extract_shebang("#!/usr/bin/env node")
    #   # => "node"
    #
    # Returns String token or nil it couldn't be parsed.
    def extract_shebang(data)
      scanner = StringScanner.new(data)

      path = scanner.scan(/^#!\s*\S+/)
      return nil unless path

      script = path.split('/').last
      if script == 'env'
        # "#!/usr/bin/env foo" names the real interpreter after env.
        scanner.scan(/\s+/)
        script = scanner.scan(/\S+/)
      end
      # Trim trailing version digits: "python2" => "python".
      script = script[/[^\d]+/, 0] if script
      script
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      scanner = StringScanner.new(data)
      tokens = []

      until scanner.eos?
        # Emit start token
        if (token = scanner.scan(/<\/?[^\s>]+/))
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif (token = scanner.scan(/\w+=/))
          tokens << token

          # Then skip over attribute value
          if scanner.scan(/"/)
            scanner.skip_until(/[^\\]"/)
          elsif scanner.scan(/'/)
            scanner.skip_until(/[^\\]'/)
          else
            scanner.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif (token = scanner.scan(/\w+/))
          tokens << token

        # Stop at the end of the tag
        elsif scanner.scan(/>/)
          scanner.terminate

        else
          scanner.getch
        end
      end

      tokens
    end
  end
end
@@ -0,0 +1,106 @@
1
+ # Vendored files and directories are excluded from language
2
+ # statistics.
3
+ #
4
+ # Lines in this file are Regexps that are matched against the file
5
+ # pathname.
6
+ #
7
+ # Please add additional test coverage to
8
+ # `test/test_blob.rb#test_vendored` if you make any changes.
9
+
10
+ ## Vendor Conventions ##
11
+
12
+ # Caches
13
+ - cache/
14
+
15
+ # C deps
16
+ # https://github.com/joyent/node
17
+ - ^deps/
18
+ - ^tools/
19
+
20
+ # Node dependencies
21
+ - node_modules/
22
+
23
+ # Vendored dependencies
24
+ - vendor/
25
+
26
+ # Debian packaging
27
+ - ^debian/
28
+
29
+ ## Commonly Bundled JavaScript frameworks ##
30
+
31
+ # jQuery
32
+ - (^|/)jquery([^.]*)(\.min)?\.js$
33
+ - (^|/)jquery\-\d\.\d(\.\d)?(\.min)?\.js$
34
+
35
+ # Prototype
36
+ - (^|/)prototype(.*)\.js$
37
+ - (^|/)effects\.js$
38
+ - (^|/)controls\.js$
39
+ - (^|/)dragdrop\.js$
40
+
41
+ # MooTools
42
+ - (^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$
43
+
44
+ # Dojo
45
+ - (^|/)dojo\.js$
46
+
47
+ # MochiKit
48
+ - (^|/)MochiKit\.js$
49
+
50
+ # YUI
51
+ - (^|/)yahoo-([^.]*)\.js$
52
+ - (^|/)yui([^.]*)\.js$
53
+
54
+ # LESS css
55
+ - (^|/)less([^.]*)(\.min)?\.js$
56
+ - (^|/)less\-\d+\.\d+\.\d+(\.min)?\.js$
57
+
58
+ # WYS editors
59
+ - (^|/)ckeditor\.js$
60
+ - (^|/)tiny_mce([^.]*)\.js$
61
+ - (^|/)tiny_mce/(langs|plugins|themes|utils)
62
+
63
+ # MathJax
64
+ - (^|/)MathJax/
65
+
66
+ # SyntaxHighlighter - http://alexgorbatchev.com/
67
+ - (^|/)shBrush([^.]*)\.js$
68
+ - (^|/)shCore\.js$
69
+ - (^|/)shLegacy\.js$
70
+
71
+ ## Python ##
72
+
73
+ # django
74
+ - (^|/)admin_media/
75
+
76
+ # Fabric
77
+ - ^fabfile\.py$
78
+
79
+ # WAF
80
+ - ^waf$
81
+
82
+
83
+ ## Obj-C ##
84
+
85
+ # Sparkle
86
+ - (^|/)Sparkle/
87
+
88
+ ## .NET ##
89
+
90
+ # Visual Studio IntelliSense
91
+ - -vsdoc\.js$
92
+
93
+ # jQuery validation plugin (MS bundles this with asp.net mvc)
94
+ - (^|/)jquery([^.]*)\.validate(\.min)?\.js$
95
+
96
+ # Microsoft Ajax
97
+ - (^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$
98
+
99
+ # NuGet
100
+ - ^[Pp]ackages/
101
+
102
+ # ExtJS
103
+ - (^|/)extjs/
104
+
105
+ # Samples folders
106
+ - ^[Ss]amples/
metadata ADDED
@@ -0,0 +1,170 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: geothird-linguist
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.6.1
5
+ platform: ruby
6
+ authors:
7
+ - GitHub
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-04-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: charlock_holmes_bundle_icu
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 0.6.9.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 0.6.9.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: escape_utils
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.2.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 0.2.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: mime-types
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.19'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.19'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pygments.rb
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.4.2
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 0.4.2
69
+ - !ruby/object:Gem::Dependency
70
+ name: mocha
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: json
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rake
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: yajl-ruby
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description:
126
+ email:
127
+ executables:
128
+ - linguist
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - lib/linguist/blob_helper.rb
133
+ - lib/linguist/classifier.rb
134
+ - lib/linguist/file_blob.rb
135
+ - lib/linguist/generated.rb
136
+ - lib/linguist/language.rb
137
+ - lib/linguist/languages.yml
138
+ - lib/linguist/md5.rb
139
+ - lib/linguist/popular.yml
140
+ - lib/linguist/repository.rb
141
+ - lib/linguist/samples.json
142
+ - lib/linguist/samples.rb
143
+ - lib/linguist/tokenizer.rb
144
+ - lib/linguist/vendor.yml
145
+ - lib/linguist.rb
146
+ - bin/linguist
147
+ homepage: https://github.com/github/linguist
148
+ licenses: []
149
+ metadata: {}
150
+ post_install_message:
151
+ rdoc_options: []
152
+ require_paths:
153
+ - lib
154
+ required_ruby_version: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - '>='
157
+ - !ruby/object:Gem::Version
158
+ version: '0'
159
+ required_rubygems_version: !ruby/object:Gem::Requirement
160
+ requirements:
161
+ - - '>='
162
+ - !ruby/object:Gem::Version
163
+ version: '0'
164
+ requirements: []
165
+ rubyforge_project:
166
+ rubygems_version: 2.0.0
167
+ signing_key:
168
+ specification_version: 4
169
+ summary: GitHub Language detection
170
+ test_files: []