plagiarism2 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bff4f9ef7060e99ba754a8e85d88ded43bc92234
4
- data.tar.gz: fb32b027c73191b7724644a4f169db41bd93135b
3
+ metadata.gz: a5ae907124afcc6b114cb2fd1a575b7d0a4c5c85
4
+ data.tar.gz: b6c4a942896b1933ce60a7e3216b8c839bf422a5
5
5
  SHA512:
6
- metadata.gz: c919a74db75b832676a998db8a5084cd7b93c2da18d171178ad98091f7d363bb4058399a397e8bacc401fd61f00b64d7133ab4c8a7421c5483e79eef99e4d4bc
7
- data.tar.gz: a38c0840f60715c6c32bdcb8673d2217128d5d945c4bbb6133addc8b1a119bace2aa951f5fa3675cf04bbcf01953651a4cb67ec97fa1beb7b4a7021032557d13
6
+ metadata.gz: bb93fde868664ee036d0fdaa13ee7de53235a372ec41daf5d72109f490b6985094e44afbd8cd189b77693f1a9283d2216010b7c040a5792505874b58e321ffdd
7
+ data.tar.gz: 93f3cf1b8fed1d7071a80ce4c92383beec0a1b49de3d27f6956c645cc269b8140e4b16d86bfd63384def1ab4ffa8feb9a70b8bf24dfe610f61f120e7b7ea65ef
data/README.md CHANGED
@@ -22,20 +22,24 @@ Add in your config
22
22
 
23
23
  ```ruby
24
24
  Plagiarism.configure do |config|
25
- config.strategies = [xxx] # => [:google, :bing, :duck, :yahoo]
25
+ config.strategies = [xxx] # => [:google, :bing, :duck, :yahoo, :free_google]
26
26
  config.whitelists = ['www.ring.md']
27
27
  end
28
28
  ```
29
29
 
30
+ > The duck, yahoo and free google engines limit how many requests you can make. If you send too many requests, they will mark you as a spammer.
31
+
30
32
  + To use the bing engine, you have to set an access key (you can get one from [here](https://datamarket.azure.com/dataset/bing/searchweb))
31
33
 
32
34
  ```ruby
35
+ config.strategies = :bing
33
36
  config.bing_key = xxx
34
37
  ```
35
38
 
36
- + Using google engine, you have to set two keys (you can get it from [here](https://developers.google.com/custom-search/json-api/v1/using_rest))
39
+ + To use the google engine (not free google), you have to set two keys (you can get them from [here](https://developers.google.com/custom-search/json-api/v1/using_rest))
37
40
 
38
41
  ```ruby
42
+ config.strategies = :google
39
43
  config.google_key = xxx
40
44
  config.google_cx = xx
41
45
  ```
@@ -6,11 +6,22 @@ module Plagiarism
6
6
  include Thor::Actions
7
7
  CONFIG_PATH = '~/.plagiarism.yml'
8
8
 
9
+ class << self
10
+ def load_config(path)
11
+ config = YAML.load_file(File.expand_path path)
12
+ Config.strategies = config['strategies'].split(',').map &:strip
13
+ Config.whitelists = config['whitelists'].split(',').map &:strip
14
+ Config.bing_key = config['bing_key']
15
+ Config.google_key = config['google_key']
16
+ Config.google_cx = config['google_cx']
17
+ end
18
+ end
19
+
9
20
  desc 'init', 'Create file for plagiarism to load config'
10
21
  method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
11
22
  def init
12
23
  create_file options[:path], <<-STRING
13
- strategies: "duck, yahoo"
24
+ strategies: "yahoo"
14
25
  whitelists: "www.ring.md, blog.ring.md"
15
26
  bing_key: xxx
16
27
  google_key: xxx
@@ -22,14 +33,16 @@ google_cx: xxx
22
33
  method_option :content, aliases: '-c', desc: 'content which need to checked', type: :string, required: true
23
34
  method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
24
35
  def unique
25
- config = YAML.load_file(File.expand_path options[:path])
26
- Config.strategies = config['strategies'].split(',').map &:strip
27
- Config.whitelists = config['whitelists'].split(',').map &:strip
28
- Config.bing_key = config['bing_key']
29
- Config.google_key = config['google_key']
30
- Config.google_cx = config['google_cx']
31
-
36
+ Cli.load_config(options[:path])
32
37
  puts Plagiarism.unique? options[:content]
33
38
  end
39
+
40
+ desc 'match', 'Get the first match from search engines'
41
+ method_option :content, aliases: '-c', desc: 'content which need to get', type: :string, required: true
42
+ method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
43
+ def match
44
+ Cli.load_config(options[:path])
45
+ puts Plagiarism.match options[:content]
46
+ end
34
47
  end
35
48
  end
@@ -9,11 +9,11 @@ module Plagiarism
9
9
  Typhoeus.get(URL, params: params.merge('$format' => :json, 'Query' => "'#{content}'"), userpwd: ":#{Config.bing_key}")
10
10
  end
11
11
 
12
- def exists?(response)
12
+ def iterate(response)
13
13
  results = JSON.parse(response)['d']['results'] rescue []
14
14
  results.all? do |r|
15
15
  uri = URI.parse URI::encode(r['Url'])
16
- uri.host =~ whitelists_regex
16
+ yield uri
17
17
  end
18
18
  end
19
19
 
@@ -9,17 +9,16 @@ module Plagiarism
9
9
  Typhoeus.get(URL, params: params.merge(q: content))
10
10
  end
11
11
 
12
- def exists?(response)
12
+ def iterate(response)
13
13
  doc = Nokogiri::HTML response
14
14
  doc.css('.results_links_deep:not(.result--no-result)').all? do |row|
15
15
  href = row.at_css('.result__a').attributes['href'].value rescue ''
16
16
  uri = URI.parse URI::encode(href)
17
- uri.host =~ whitelists_regex
17
+ yield uri
18
18
  end
19
19
  end
20
20
 
21
21
  end
22
-
23
22
  end
24
23
  end
25
24
  end
@@ -10,6 +10,14 @@ module Plagiarism
10
10
  raise
11
11
  end
12
12
 
13
+ def iterate(r)
14
+ raise
15
+ end
16
+
17
+ def exists?(response)
18
+ iterate(response) { |uri| uri.host =~ whitelists_regex }
19
+ end
20
+
13
21
  def valid_segments(ps, params)
14
22
  ps.segment.count do |sentence|
15
23
  typhoeus = fetch("\"#{sentence}\"", params)
@@ -17,10 +25,6 @@ module Plagiarism
17
25
  end
18
26
  end
19
27
 
20
- def exists?(response)
21
- raise
22
- end
23
-
24
28
  def whitelists_regex
25
29
  whitelists = Config.whitelists.map { |w| Regexp.new w }
26
30
  Regexp.union whitelists
@@ -31,11 +35,25 @@ module Plagiarism
31
35
  @content, @params = c, p
32
36
  end
33
37
 
38
+ def retrieve_link(response)
39
+ raise
40
+ end
41
+
34
42
  def unique?
35
43
  ps = PragmaticSegmenter::Segmenter.new(text: content)
36
44
  valid_segments = self.class.valid_segments(ps, params)
37
45
  valid_segments / ps.segment.size >= THRESHOLD
38
46
  end
47
+
48
+ def match
49
+ typhoeus = self.class.fetch("\"#{content}\"", params)
50
+ typhoeus.success? && retrieve_link(typhoeus.response_body)
51
+ end
52
+
53
+ def retrieve_link(response)
54
+ self.class.iterate(response) { |uri| uri.host !~ self.class.whitelists_regex and return uri.to_s }
55
+ end
56
+
39
57
  end
40
58
  end
41
59
  end
@@ -0,0 +1,25 @@
1
+ module Plagiarism
2
+ module Strategies
3
+ class FreeGoogle < Engine
4
+ URL = 'https://ajax.googleapis.com/ajax/services/search/web'
5
+ VERSION = '1.0'
6
+
7
+ class << self
8
+
9
+ def fetch(content, params)
10
+ Typhoeus.get URL, params: params.merge(v: VERSION, q: content, rsz: :large)
11
+ end
12
+
13
+ def iterate(response)
14
+ results = JSON.parse(response)['responseData']['results'] rescue []
15
+ results.all? do |r|
16
+ uri = URI.parse URI::encode(r['unescapedUrl'])
17
+ yield uri
18
+ end
19
+ end
20
+
21
+ end
22
+
23
+ end
24
+ end
25
+ end
@@ -16,11 +16,11 @@ module Plagiarism
16
16
  )
17
17
  end
18
18
 
19
- def exists?(response)
19
+ def iterate(response)
20
20
  results = JSON.parse(response)['items'] || []
21
21
  results.all? do |r|
22
22
  uri = URI.parse URI::encode(r['link'])
23
- uri.host =~ whitelists_regex
23
+ yield uri
24
24
  end
25
25
  end
26
26
 
@@ -9,12 +9,12 @@ module Plagiarism
9
9
  Typhoeus.get(URL, params: params.merge(p: content))
10
10
  end
11
11
 
12
- def exists?(response)
12
+ def iterate(response)
13
13
  doc = Nokogiri::HTML response
14
14
  doc.css('.searchCenterMiddle li').all? do |row|
15
- href = row.at_css('.compTitle div').content.strip rescue ''
16
- uri = URI.parse URI::encode(href =~ /^http/ ? href : 'https://' + href)
17
- uri.host =~ whitelists_regex
15
+ href = row.at_css('.compTitle a').attributes['href'].value rescue ''
16
+ uri = URI.parse URI::encode(href)
17
+ yield uri
18
18
  end
19
19
  end
20
20
 
@@ -1,5 +1,6 @@
1
1
  require 'plagiarism/strategries/engine'
2
2
  require 'plagiarism/strategries/google'
3
+ require 'plagiarism/strategries/free_google'
3
4
  require 'plagiarism/strategries/bing'
4
5
  require 'plagiarism/strategries/duck'
5
6
  require 'plagiarism/strategries/yahoo'
@@ -8,8 +9,8 @@ module Plagiarism
8
9
  module Strategy
9
10
  extend self
10
11
 
11
- def get(name = :google)
12
- Strategies.const_get(name.to_s.sub(/\S/, &:upcase))
12
+ def get(name = :free_google)
13
+ Strategies.const_get(name.to_s.split('_').map(&:capitalize).join(''))
13
14
  end
14
15
 
15
16
  def unique?(content, params)
@@ -18,5 +19,13 @@ module Plagiarism
18
19
  klass.new(content, params).unique?
19
20
  end
20
21
  end
22
+
23
+ def match(content, params)
24
+ Config.strategies.find do |strategy|
25
+ klass = get(strategy)
26
+ link = klass.new(content, params).match
27
+ link and return (link == true ? '' : link)
28
+ end
29
+ end
21
30
  end
22
31
  end
@@ -1,3 +1,3 @@
1
1
  module Plagiarism
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
data/lib/plagiarism.rb CHANGED
@@ -19,4 +19,8 @@ module Plagiarism
19
19
  def unique?(content, params = {})
20
20
  Strategy.unique?(content, params)
21
21
  end
22
+
23
+ def match(content, params = {})
24
+ Strategy.match(content, params)
25
+ end
22
26
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: plagiarism2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - MQuy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-02 00:00:00.000000000 Z
11
+ date: 2016-04-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -146,6 +146,7 @@ files:
146
146
  - lib/plagiarism/strategries/bing.rb
147
147
  - lib/plagiarism/strategries/duck.rb
148
148
  - lib/plagiarism/strategries/engine.rb
149
+ - lib/plagiarism/strategries/free_google.rb
149
150
  - lib/plagiarism/strategries/google.rb
150
151
  - lib/plagiarism/strategries/yahoo.rb
151
152
  - lib/plagiarism/strategy.rb