site_classifier 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 43547a8a99af0382b4fea86385db18428fe267ac
4
+ data.tar.gz: fa4321d7f5ecb4bdd1c415c5c9e179b1418306b5
5
+ SHA512:
6
+ metadata.gz: f1107cddc78b414c551065976f53e6a25fee56cce3dcc2d9c5245485bad06fdfd099712fd20196aa20e6e041b92fa77c667c924e24a56d8d4ce61721554e6328
7
+ data.tar.gz: 6914f5718f3fbd9c97bdd28f7b794f564f5a4f44563c26eb7f0e65df11c5b959a11a71d7cd220af402a999a4f7166cedf647954b26aa0b297c838f7797f2af3f
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ ruby '2.0.0'
4
+ # Specify your gem's dependencies in site_classifier.gemspec
5
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Elad Meidar
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,44 @@
1
+ # SiteClassifier
2
+
3
+ SiteClassifier extracts a list of tags for a given URL, optionally translating them to English.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'site_classifier'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install site_classifier
18
+
19
+ ## Configuration
20
+
21
+ SiteClassifier.configure do |classifier|
22
+ classifier.translate = true
23
+ classifier.google_translate_api_key = "XXX"
24
+ end
25
+
26
+ - translate (true / false) - indicates whether tags should be translated to English. If `false`, tags appear in the site's native language.
27
+ - google_translate_api_key (string) - if translate is `true`, supply a Google Translate API key to allow translation.
28
+
29
+ ## Usage
30
+
31
+ SiteClassifier::Extractor.parse_site("http://cnn.com")
32
+ #=> {:most_significant=>["cnn", "news", "breaking news", "busines", "sport", "entertainment", "special report"],
33
+ :language=>"auto",
34
+ :url=>"http://cnn.com",
35
+ :tags=>["cnn", "cnn news", "cnn international", "cnn international news", "cnn edition", "edition news", "news", "news online", "breaking news", "u.s. news", "world news", "global news", "weather", "business", "cnn money", "sports", "politics", "law", "technology", "entertainment", "education", "travel", "health", "special reports", "autos", "developing story", "news video", "cnn intl", "podcasts", "world blogs"],
36
+ :description=>"CNN.com International delivers breaking news from across the globe and information on the latest top stories, business, sports and entertainment headlines. Follow the news as it happens through: special reports, videos, audio, photo galleries plus interactive maps and timelines."}
37
+
38
+ ## Contributing
39
+
40
+ 1. Fork it
41
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
42
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
43
+ 4. Push to the branch (`git push origin my-new-feature`)
44
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,18 @@
1
module SiteClassifier
  # Holds the settings that control tag translation.
  #
  # translate                - when true, extracted tags are translated to English.
  # google_translate_api_key - API key passed along to the translation calls.
  class Configuration
    attr_accessor :translate, :google_translate_api_key

    # Build a configuration from an options hash.
    #
    # options - Hash that may contain :translate and :google_translate_api_key.
    #           A missing or falsy :translate becomes false; nil is treated
    #           as an empty hash.
    def initialize(options = {})
      options ||= {}
      @translate = options[:translate] || false
      @google_translate_api_key = options[:google_translate_api_key]
    end

    # Configure by block.
    #
    # Yields a fresh Configuration so the caller can set its attributes and
    # returns that instance. When no block is given the default configuration
    # is returned instead of raising LocalJumpError.
    def self.configure(&block)
      new_configuration = new
      yield new_configuration if block_given?
      new_configuration
    end
  end
end
@@ -0,0 +1,105 @@
1
module SiteClassifier
  # Downloads a page and derives a list of descriptive tags from its
  # keywords/description meta tags, falling back to repeated words found in
  # the page text.
  class Extractor
    include HTTParty

    attr_accessor :url, :tags, :description, :word_frequency, :lang

    # url         - address of the page the data was extracted from.
    # tags        - Array of lower-cased keyword strings (may be empty).
    # word_hash   - Hash of word => occurrence count, used when tags are empty.
    # description - page description String, or nil when the page has none.
    # lang        - language code of the page; nil is tolerated and stored as
    #               an empty string, which validate_lang later resets to "auto".
    def initialize(url, tags, word_hash, description, lang)
      @url = url
      @tags = tags
      @description = description
      @word_frequency = word_hash
      @lang = lang.to_s.downcase
    end

    # Normalize site language.
    #
    # Keeps the language when it is a key of EasyTranslate::LANGUAGES,
    # otherwise resets it to "auto". Returns the resulting language.
    def validate_lang
      if EasyTranslate::LANGUAGES.key?(@lang)
        @lang
      else
        self.lang = "auto"
      end
    end

    # Extract most significant tags.
    #
    # Picks the tags (or, when there are no tags, the frequent words) that
    # occur in the description and singularizes them. When nothing matches,
    # or there is no description, all frequent words are used. If translation
    # is enabled the result is translated to English; any translation failure
    # falls back to the untranslated list.
    def most_significant
      most_sig = []
      unless description.nil?
        # Lower-case the description once instead of once per candidate.
        lowered = description.downcase
        candidates = tags.any? ? tags : word_frequency.keys
        most_sig = candidates.select { |tag| lowered.include?(tag) }.collect { |tag| tag.singularize }
      end
      most_sig = word_frequency.keys if most_sig.empty?

      validate_lang

      return most_sig unless SiteClassifier.translate_tags?

      begin
        api_key = SiteClassifier.configuration.google_translate_api_key
        if lang == "auto"
          @lang = EasyTranslate.detect(most_sig.first, key: api_key)
        end
        EasyTranslate.translate(most_sig, from: lang, to: :en, key: api_key)
      rescue StandardError
        # Best effort: translation problems must not lose the extracted tags.
        most_sig
      end
    end

    # Hash summary of the extraction. most_significant is evaluated first, so
    # :language reflects the validated (and possibly detected) language.
    def to_hash
      {
        most_significant: most_significant,
        language: lang,
        url: url,
        tags: tags,
        description: description
      }
    end

    # Fetch url and build an Extractor from the page's markup.
    #
    # Returns nil for a nil or empty url. Errors raised while fetching the
    # page are not rescued here.
    def self.parse_site(url = "")
      return if url.nil? || url == ""

      html = Nokogiri::HTML(get(url).parsed_response)

      keywords = meta_content(html, "keywords")
      tags = keywords ? keywords.split(",").collect(&:strip).collect(&:downcase) : []
      # Looked up independently of the keywords so that a page without a
      # keywords meta tag still contributes its description.
      description = meta_content(html, "description")
      word_hash = tags.empty? ? frequent_words(html) : {}

      new(url, tags, word_hash, description, detect_page_lang(html))
    end

    # Content attribute of the first <meta name="..."> element, or nil.
    def self.meta_content(html, name)
      node = html.search(%(meta[name="#{name}"])).first
      node && node["content"]
    end

    # Two-letter language code from the <html> element. xml:lang takes
    # precedence over lang; a missing attribute no longer overwrites a value
    # that was found. Returns "auto" when neither attribute is present.
    def self.detect_page_lang(html)
      root = html.search("html").first
      return "auto" if root.nil?

      %w[xml:lang lang].each do |attribute|
        code = root[attribute].to_s.slice(0..1)
        return code unless code.empty?
      end
      "auto"
    end

    # Hash of word => count for words that occur at least twice in the
    # page's <p> elements, or in its <div> elements when the paragraphs
    # yield no words.
    def self.frequent_words(html)
      words = significant_words(html, "p")
      words = significant_words(html, "div") if words.empty?

      counts = Hash.new(0)
      words.each { |word| counts[word] += 1 }
      counts.reject! { |_word, count| count < 2 }
      counts
    end

    # Whitespace-separated words of at least four characters taken from the
    # text of every element matching selector.
    def self.significant_words(html, selector)
      html.search(selector).collect { |node| node.text.strip.split }.flatten.reject { |word| word.size < 4 }
    end

    private_class_method :meta_content, :detect_page_lang, :frequent_words, :significant_words
  end
end
@@ -0,0 +1,3 @@
1
module SiteClassifier
  # Current gem version. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.2".freeze
end
@@ -0,0 +1,25 @@
1
+ require "site_classifier/version"
2
+ require 'httparty'
3
+ require 'easy_translate'
4
+ require 'nokogiri'
5
+ require 'active_support/inflector'
6
+
7
# Top-level namespace of the gem; stores the active Configuration.
module SiteClassifier

  autoload :Configuration, 'site_classifier/configuration'
  autoload :Extractor, 'site_classifier/extractor'

  # Instance-level reader kept for backward compatibility. It only affects
  # objects that include this module and does not expose the module's own
  # configuration.
  attr_reader :setup

  class << self
    # The stored Configuration, or nil when neither configure nor
    # configuration has been called yet.
    attr_reader :setup
  end

  # Build a Configuration through the given block and store it as the
  # active configuration. Returns the new Configuration.
  def self.configure(&block)
    @setup = SiteClassifier::Configuration.configure(&block)
  end

  # True only when the active configuration's translate flag is exactly true.
  def self.translate_tags?
    configuration.translate == true
  end

  # The active Configuration; a default one is created and stored on first use.
  def self.configuration
    @setup ||= SiteClassifier::Configuration.new
  end
end
@@ -0,0 +1,28 @@
1
# coding: utf-8
# Gem specification for site_classifier: metadata, packaged files and dependencies.
lib_dir = File.expand_path('lib', File.dirname(__FILE__))
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'site_classifier/version'

Gem::Specification.new do |spec|
  # Identity and descriptive metadata.
  spec.name          = "site_classifier"
  spec.version       = SiteClassifier::VERSION
  spec.authors       = ["Elad Meidar"]
  spec.email         = ["elad@eizesus.com"]
  spec.description   = "Return a tag list for submitted urls"
  spec.summary       = "This gem extracts a list of english tags for a given url"
  spec.homepage      = "https://github.com/ShinobiDevs/SiteClassifier"
  spec.license       = "MIT"

  # Package every file tracked by git; executables and tests are derived from that list.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Development-only dependencies.
  [["bundler", "~> 1.3"], ["rake"], ["rspec"]].each do |requirement|
    spec.add_development_dependency(*requirement)
  end

  # Runtime dependencies, pinned to exact versions.
  {
    "httparty"       => "0.11.0",
    "nokogiri"       => "1.6.0",
    "easy_translate" => "0.3.3",
    "active_support" => "3.0.0"
  }.each do |gem_name, version|
    spec.add_dependency gem_name, version
  end
end
@@ -0,0 +1,23 @@
1
+ require 'spec_helper'
2
+
3
# Specs for SiteClassifier::Configuration: construction from an options hash
# and construction through a configuration block.
describe SiteClassifier::Configuration do
  describe "#initialize" do
    it "should access an option hash" do
      conf = SiteClassifier::Configuration.new({translate: true, google_translate_api_key: "xxx"})
      # eq(true) instead of be_true: the be_true matcher is not available in
      # RSpec 3, and the rspec dependency is not pinned to a 2.x release.
      conf.translate.should eq(true)
      conf.google_translate_api_key.should eq("xxx")
    end
  end

  describe "#configure" do
    it "should return a valid instance with a configuration block" do
      # The block parameter has its own name so it does not shadow `conf`.
      conf = SiteClassifier::Configuration.configure do |c|
        c.translate = true
        c.google_translate_api_key = "yyy"
      end

      conf.translate.should eq(true)
      conf.google_translate_api_key.should eq("yyy")
    end
  end
end
@@ -0,0 +1,50 @@
1
+ require 'spec_helper'
2
+
3
# Specs for SiteClassifier::Extractor: construction, language validation and
# selection of the most significant tags (with translation switched off).
describe SiteClassifier::Extractor do
  describe "#initialize" do
    it "should create a valid instance" do
      extractor = SiteClassifier::Extractor.new("http://cnn.com", ["news"], {"news" => 2, "economy" => 2}, "This is cnn", "auto")

      extractor.url.should eq("http://cnn.com")
    end
  end

  describe "#validate_lang" do
    # A fresh instance per example: the second example mutates the language,
    # so sharing one instance would make the examples order-dependent.
    before(:each) do
      @extractor = SiteClassifier::Extractor.new("http://cnn.com", ["news"], {"news" => 2, "economy" => 2}, "This is cnn", "en")
    end

    it "should keep language if it is a known language to google translate" do
      # assumes "en" is a key of EasyTranslate::LANGUAGES — TODO confirm
      @extractor.validate_lang
      @extractor.lang.should eq("en")
    end

    it "should reset to 'auto' if language is not a known language to google translate" do
      @extractor.lang = "bullshit-language"
      @extractor.validate_lang
      @extractor.lang.should eq("auto")
    end
  end

  describe "#most_significant" do
    before(:each) do
      # Translation is disabled so no call is made to the translation service.
      SiteClassifier.configuration.translate = false
      @extractor = SiteClassifier::Extractor.new("http://cnn.com", ["news"], {"news" => 10, "economy" => 5, "text" => 3, "elad" => 1, "miki" => 1}, "This is cnn news", "auto")
    end

    it "should return a list of the most common tags that exist in description" do
      @extractor.most_significant.should eq(["news"])
    end

    it "should return a list of most frequest used words and exist in description if tags are empty and description exists" do
      @extractor.tags = []
      @extractor.most_significant.should eq(["news"])
    end

    it "should return a list of most frequest used words if tags are empty and description missing" do
      @extractor.tags = []
      @extractor.description = nil
      @extractor.most_significant.should eq(["news", "economy", "text", "elad", "miki"])
    end
  end
end
@@ -0,0 +1,43 @@
1
+ require 'spec_helper'
2
+
3
# Specs for the module-level API of SiteClassifier: configure, configuration
# and translate_tags?.
describe SiteClassifier do

  describe '#configure' do
    it "should have a 'configure' method" do
      SiteClassifier.should respond_to(:configure)
    end

    it "should accept block configuration" do
      SiteClassifier.configure do |sc|
        sc.translate = true
        sc.google_translate_api_key = "xxx"
      end

      # eq(true) instead of be_true: the be_true matcher is not available in
      # RSpec 3, and the rspec dependency is not pinned to a 2.x release.
      SiteClassifier.configuration.translate.should eq(true)
      SiteClassifier.configuration.google_translate_api_key.should eq("xxx")
    end
  end

  describe '#configuration' do
    it "should have a 'configuration' method" do
      SiteClassifier.should respond_to(:configuration)
    end
  end

  describe "#translate_tags?" do
    before(:all) do
      SiteClassifier.configure do |sc|
        sc.translate = true
        sc.google_translate_api_key = "xxx"
      end
    end

    it "should have a 'translate_tags?' method" do
      SiteClassifier.should respond_to(:translate_tags?)
    end

    it "should return true if translate is set to true in configuration" do
      SiteClassifier.translate_tags?.should eq(true)
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'site_classifier'
metadata ADDED
@@ -0,0 +1,161 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: site_classifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Elad Meidar
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-09-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: httparty
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '='
60
+ - !ruby/object:Gem::Version
61
+ version: 0.11.0
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '='
67
+ - !ruby/object:Gem::Version
68
+ version: 0.11.0
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '='
74
+ - !ruby/object:Gem::Version
75
+ version: 1.6.0
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '='
81
+ - !ruby/object:Gem::Version
82
+ version: 1.6.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: easy_translate
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '='
88
+ - !ruby/object:Gem::Version
89
+ version: 0.3.3
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '='
95
+ - !ruby/object:Gem::Version
96
+ version: 0.3.3
97
+ - !ruby/object:Gem::Dependency
98
+ name: active_support
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '='
102
+ - !ruby/object:Gem::Version
103
+ version: 3.0.0
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '='
109
+ - !ruby/object:Gem::Version
110
+ version: 3.0.0
111
+ description: Return a tag list for submitted urls
112
+ email:
113
+ - elad@eizesus.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - .gitignore
119
+ - .rspec
120
+ - Gemfile
121
+ - LICENSE.txt
122
+ - README.md
123
+ - Rakefile
124
+ - lib/site_classifier.rb
125
+ - lib/site_classifier/configuration.rb
126
+ - lib/site_classifier/extractor.rb
127
+ - lib/site_classifier/version.rb
128
+ - site_classifier.gemspec
129
+ - spec/models/configuration_spec.rb
130
+ - spec/models/extractor_spec.rb
131
+ - spec/models/site_classifier_spec.rb
132
+ - spec/spec_helper.rb
133
+ homepage: https://github.com/ShinobiDevs/SiteClassifier
134
+ licenses:
135
+ - MIT
136
+ metadata: {}
137
+ post_install_message:
138
+ rdoc_options: []
139
+ require_paths:
140
+ - lib
141
+ required_ruby_version: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - '>='
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ required_rubygems_version: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - '>='
149
+ - !ruby/object:Gem::Version
150
+ version: '0'
151
+ requirements: []
152
+ rubyforge_project:
153
+ rubygems_version: 2.0.6
154
+ signing_key:
155
+ specification_version: 4
156
+ summary: This gem extracts a list of english tags for a given url
157
+ test_files:
158
+ - spec/models/configuration_spec.rb
159
+ - spec/models/extractor_spec.rb
160
+ - spec/models/site_classifier_spec.rb
161
+ - spec/spec_helper.rb