plagiarism2 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bff4f9ef7060e99ba754a8e85d88ded43bc92234
4
- data.tar.gz: fb32b027c73191b7724644a4f169db41bd93135b
3
+ metadata.gz: a5ae907124afcc6b114cb2fd1a575b7d0a4c5c85
4
+ data.tar.gz: b6c4a942896b1933ce60a7e3216b8c839bf422a5
5
5
  SHA512:
6
- metadata.gz: c919a74db75b832676a998db8a5084cd7b93c2da18d171178ad98091f7d363bb4058399a397e8bacc401fd61f00b64d7133ab4c8a7421c5483e79eef99e4d4bc
7
- data.tar.gz: a38c0840f60715c6c32bdcb8673d2217128d5d945c4bbb6133addc8b1a119bace2aa951f5fa3675cf04bbcf01953651a4cb67ec97fa1beb7b4a7021032557d13
6
+ metadata.gz: bb93fde868664ee036d0fdaa13ee7de53235a372ec41daf5d72109f490b6985094e44afbd8cd189b77693f1a9283d2216010b7c040a5792505874b58e321ffdd
7
+ data.tar.gz: 93f3cf1b8fed1d7071a80ce4c92383beec0a1b49de3d27f6956c645cc269b8140e4b16d86bfd63384def1ab4ffa8feb9a70b8bf24dfe610f61f120e7b7ea65ef
data/README.md CHANGED
@@ -22,20 +22,24 @@ Add in your config
22
22
 
23
23
  ```ruby
24
24
  Plagiarism.configure do |config|
25
- config.strategies = [xxx] # => [:google, :bing, :duck, :yahoo]
25
+ config.strategies = [xxx] # => [:google, :bing, :duck, :yahoo, :free_google]
26
26
  config.whitelists = ['www.ring.md']
27
27
  end
28
28
  ```
29
29
 
30
+ > There is a limit on using duck, yahoo and free google. If you send too many requests, they will mark you as spam.
31
+
30
32
  + To use the bing engine, you have to set an access key (you can get it from [here](https://datamarket.azure.com/dataset/bing/searchweb))
31
33
 
32
34
  ```ruby
35
+ config.strategies = :bing
33
36
  config.bing_key = xxx
34
37
  ```
35
38
 
36
- + Using google engine, you have to set two keys (you can get it from [here](https://developers.google.com/custom-search/json-api/v1/using_rest))
39
+ + To use the google engine (not free google), you have to set two keys (you can get them from [here](https://developers.google.com/custom-search/json-api/v1/using_rest))
37
40
 
38
41
  ```ruby
42
+ config.strategies = :google
39
43
  config.google_key = xxx
40
44
  config.google_cx = xx
41
45
  ```
@@ -6,11 +6,22 @@ module Plagiarism
6
6
  include Thor::Actions
7
7
  CONFIG_PATH = '~/.plagiarism.yml'
8
8
 
9
+ class << self
10
+ def load_config(path)
11
+ config = YAML.load_file(File.expand_path path)
12
+ Config.strategies = config['strategies'].split(',').map &:strip
13
+ Config.whitelists = config['whitelists'].split(',').map &:strip
14
+ Config.bing_key = config['bing_key']
15
+ Config.google_key = config['google_key']
16
+ Config.google_cx = config['google_cx']
17
+ end
18
+ end
19
+
9
20
  desc 'init', 'Create file for plagiarism to load config'
10
21
  method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
11
22
  def init
12
23
  create_file options[:path], <<-STRING
13
- strategies: "duck, yahoo"
24
+ strategies: "yahoo"
14
25
  whitelists: "www.ring.md, blog.ring.md"
15
26
  bing_key: xxx
16
27
  google_key: xxx
@@ -22,14 +33,16 @@ google_cx: xxx
22
33
  method_option :content, aliases: '-c', desc: 'content which need to checked', type: :string, required: true
23
34
  method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
24
35
  def unique
25
- config = YAML.load_file(File.expand_path options[:path])
26
- Config.strategies = config['strategies'].split(',').map &:strip
27
- Config.whitelists = config['whitelists'].split(',').map &:strip
28
- Config.bing_key = config['bing_key']
29
- Config.google_key = config['google_key']
30
- Config.google_cx = config['google_cx']
31
-
36
+ Cli.load_config(options[:path])
32
37
  puts Plagiarism.unique? options[:content]
33
38
  end
39
+
40
+ desc 'match', 'Get the first match from search engines'
41
+ method_option :content, aliases: '-c', desc: 'content which need to get', type: :string, required: true
42
+ method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
43
+ def match
44
+ Cli.load_config(options[:path])
45
+ puts Plagiarism.match options[:content]
46
+ end
34
47
  end
35
48
  end
@@ -9,11 +9,11 @@ module Plagiarism
9
9
  Typhoeus.get(URL, params: params.merge('$format' => :json, 'Query' => "'#{content}'"), userpwd: ":#{Config.bing_key}")
10
10
  end
11
11
 
12
- def exists?(response)
12
+ def iterate(response)
13
13
  results = JSON.parse(response)['d']['results'] rescue []
14
14
  results.all? do |r|
15
15
  uri = URI.parse URI::encode(r['Url'])
16
- uri.host =~ whitelists_regex
16
+ yield uri
17
17
  end
18
18
  end
19
19
 
@@ -9,17 +9,16 @@ module Plagiarism
9
9
  Typhoeus.get(URL, params: params.merge(q: content))
10
10
  end
11
11
 
12
- def exists?(response)
12
+ def iterate(response)
13
13
  doc = Nokogiri::HTML response
14
14
  doc.css('.results_links_deep:not(.result--no-result)').all? do |row|
15
15
  href = row.at_css('.result__a').attributes['href'].value rescue ''
16
16
  uri = URI.parse URI::encode(href)
17
- uri.host =~ whitelists_regex
17
+ yield uri
18
18
  end
19
19
  end
20
20
 
21
21
  end
22
-
23
22
  end
24
23
  end
25
24
  end
@@ -10,6 +10,14 @@ module Plagiarism
10
10
  raise
11
11
  end
12
12
 
13
+ def iterate(r)
14
+ raise
15
+ end
16
+
17
+ def exists?(response)
18
+ iterate(response) { |uri| uri.host =~ whitelists_regex }
19
+ end
20
+
13
21
  def valid_segments(ps, params)
14
22
  ps.segment.count do |sentence|
15
23
  typhoeus = fetch("\"#{sentence}\"", params)
@@ -17,10 +25,6 @@ module Plagiarism
17
25
  end
18
26
  end
19
27
 
20
- def exists?(response)
21
- raise
22
- end
23
-
24
28
  def whitelists_regex
25
29
  whitelists = Config.whitelists.map { |w| Regexp.new w }
26
30
  Regexp.union whitelists
@@ -31,11 +35,25 @@ module Plagiarism
31
35
  @content, @params = c, p
32
36
  end
33
37
 
38
+ def retrieve_link(response)
39
+ raise
40
+ end
41
+
34
42
  def unique?
35
43
  ps = PragmaticSegmenter::Segmenter.new(text: content)
36
44
  valid_segments = self.class.valid_segments(ps, params)
37
45
  valid_segments / ps.segment.size >= THRESHOLD
38
46
  end
47
+
48
+ def match
49
+ typhoeus = self.class.fetch("\"#{content}\"", params)
50
+ typhoeus.success? && retrieve_link(typhoeus.response_body)
51
+ end
52
+
53
+ def retrieve_link(response)
54
+ self.class.iterate(response) { |uri| uri.host !~ self.class.whitelists_regex and return uri.to_s }
55
+ end
56
+
39
57
  end
40
58
  end
41
59
  end
@@ -0,0 +1,25 @@
1
+ module Plagiarism
2
+ module Strategies
3
+ class FreeGoogle < Engine
4
+ URL = 'https://ajax.googleapis.com/ajax/services/search/web'
5
+ VERSION = '1.0'
6
+
7
+ class << self
8
+
9
+ def fetch(content, params)
10
+ Typhoeus.get URL, params: params.merge(v: VERSION, q: content, rsz: :large)
11
+ end
12
+
13
+ def iterate(response)
14
+ results = JSON.parse(response)['responseData']['results'] rescue []
15
+ results.all? do |r|
16
+ uri = URI.parse URI::encode(r['unescapedUrl'])
17
+ yield uri
18
+ end
19
+ end
20
+
21
+ end
22
+
23
+ end
24
+ end
25
+ end
@@ -16,11 +16,11 @@ module Plagiarism
16
16
  )
17
17
  end
18
18
 
19
- def exists?(response)
19
+ def iterate(response)
20
20
  results = JSON.parse(response)['items'] || []
21
21
  results.all? do |r|
22
22
  uri = URI.parse URI::encode(r['link'])
23
- uri.host =~ whitelists_regex
23
+ yield uri
24
24
  end
25
25
  end
26
26
 
@@ -9,12 +9,12 @@ module Plagiarism
9
9
  Typhoeus.get(URL, params: params.merge(p: content))
10
10
  end
11
11
 
12
- def exists?(response)
12
+ def iterate(response)
13
13
  doc = Nokogiri::HTML response
14
14
  doc.css('.searchCenterMiddle li').all? do |row|
15
- href = row.at_css('.compTitle div').content.strip rescue ''
16
- uri = URI.parse URI::encode(href =~ /^http/ ? href : 'https://' + href)
17
- uri.host =~ whitelists_regex
15
+ href = row.at_css('.compTitle a').attributes['href'].value rescue ''
16
+ uri = URI.parse URI::encode(href)
17
+ yield uri
18
18
  end
19
19
  end
20
20
 
@@ -1,5 +1,6 @@
1
1
  require 'plagiarism/strategries/engine'
2
2
  require 'plagiarism/strategries/google'
3
+ require 'plagiarism/strategries/free_google'
3
4
  require 'plagiarism/strategries/bing'
4
5
  require 'plagiarism/strategries/duck'
5
6
  require 'plagiarism/strategries/yahoo'
@@ -8,8 +9,8 @@ module Plagiarism
8
9
  module Strategy
9
10
  extend self
10
11
 
11
- def get(name = :google)
12
- Strategies.const_get(name.to_s.sub(/\S/, &:upcase))
12
+ def get(name = :free_google)
13
+ Strategies.const_get(name.to_s.split('_').map(&:capitalize).join(''))
13
14
  end
14
15
 
15
16
  def unique?(content, params)
@@ -18,5 +19,13 @@ module Plagiarism
18
19
  klass.new(content, params).unique?
19
20
  end
20
21
  end
22
+
23
+ def match(content, params)
24
+ Config.strategies.find do |strategy|
25
+ klass = get(strategy)
26
+ link = klass.new(content, params).match
27
+ link and return (link == true ? '' : link)
28
+ end
29
+ end
21
30
  end
22
31
  end
@@ -1,3 +1,3 @@
1
1
  module Plagiarism
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
data/lib/plagiarism.rb CHANGED
@@ -19,4 +19,8 @@ module Plagiarism
19
19
  def unique?(content, params = {})
20
20
  Strategy.unique?(content, params)
21
21
  end
22
+
23
+ def match(content, params = {})
24
+ Strategy.match(content, params)
25
+ end
22
26
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: plagiarism2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - MQuy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-02 00:00:00.000000000 Z
11
+ date: 2016-04-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -146,6 +146,7 @@ files:
146
146
  - lib/plagiarism/strategries/bing.rb
147
147
  - lib/plagiarism/strategries/duck.rb
148
148
  - lib/plagiarism/strategries/engine.rb
149
+ - lib/plagiarism/strategries/free_google.rb
149
150
  - lib/plagiarism/strategries/google.rb
150
151
  - lib/plagiarism/strategries/yahoo.rb
151
152
  - lib/plagiarism/strategy.rb