plagiarism2 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1af144aabf96e33a5ec59209dc16a798eb46f89b
4
- data.tar.gz: 32a0aa5a9eb659c7c788eb10bb6cdfb0990771af
3
+ metadata.gz: edb2847694cb3d1c3f8d827b5e67ffbe77b8b9ed
4
+ data.tar.gz: 6689d076612acf8b6c803e098b5467dd19ba41f6
5
5
  SHA512:
6
- metadata.gz: 172b035e164062011de6133cf9ef773cafc6e43ab8859ee7731cd9a642a99f27e33c25beff740b05b293faf7e6281ba03493f0e15254d4de2f390aa1383938fd
7
- data.tar.gz: 8a5a00d48dc1f11198215626692c9af94bae13e83bf80904f3576d56ecd6dfafcad890cd7510eb4164ddba01c61f6fb21d2796875af8fb15e660783d525e0e66
6
+ metadata.gz: f2f82df5e758d90bd19ec427cd091f82626c47db8cf45974e9efdf56b6c088a2372213b303be53dcebb4164f06d7075b15845f9467013ee34ffc1c42db0ca33c
7
+ data.tar.gz: 6f331bfc6199816bf36ecdbd8f4413e605729fa89872f35322bd8ba7993a02e42726a492186d2f3c44b720d607def80a5ba72516938d5dc34b1c84f112d129c5
data/README.md CHANGED
@@ -27,12 +27,19 @@ Plagiarism.configure do |config|
27
27
  end
28
28
  ```
29
29
 
30
- In case of using bing engine, you have to set access key
30
+ + To use the Bing engine, you have to set an access key (you can get one [here](https://datamarket.azure.com/dataset/bing/searchweb))
31
31
 
32
32
  ```ruby
33
33
  config.bing_key = xxx
34
34
  ```
35
35
 
36
+ + To use the Google engine, you have to set two keys (you can get them [here](https://developers.google.com/custom-search/json-api/v1/using_rest))
37
+
38
+ ```ruby
39
+ config.google_key = xxx
40
+ config.google_cx = xx
41
+ ```
42
+
36
43
  After that you can check the uniqueness of the content
37
44
 
38
45
  ```ruby
@@ -46,7 +53,7 @@ Plagiarism.unique? text
46
53
 
47
54
  Bug reports and pull requests are welcome on GitHub at https://github.com/MQuy/plagiarism. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
48
55
 
49
- > Disclaim: Yahoo and DuckDuckGo don't support api, therefore I have to crawl data, if you find any solution to fix, please help me.
56
+ > Disclaimer: Yahoo and DuckDuckGo don't provide a search API, so plagiarism has to crawl their result pages, and they may flag it as spam when it sends too many requests. If you find a better solution, please help me.
50
57
 
51
58
  ## License
52
59
 
@@ -4,6 +4,10 @@ module Plagiarism
4
4
 
5
5
  attr_accessor :strategies
6
6
  attr_accessor :whitelists
7
+
7
8
  attr_accessor :bing_key
9
+
10
+ attr_accessor :google_key
11
+ attr_accessor :google_cx
8
12
  end
9
13
  end
@@ -10,8 +10,9 @@ module Plagiarism
10
10
  end
11
11
 
12
12
  def exists?(response)
13
- JSON.parse(response)['d']['results'].all? do |r|
14
- uri = URI.parse(r['Url'])
13
+ results = JSON.parse(response)['d']['results'] rescue []
14
+ results.all? do |r|
15
+ uri = URI.parse URI::encode(r['Url'])
15
16
  uri.host =~ whitelists_regex
16
17
  end
17
18
  end
@@ -13,7 +13,7 @@ module Plagiarism
13
13
  doc = Nokogiri::HTML response
14
14
  doc.css('.results_links_deep:not(.result--no-result)').all? do |row|
15
15
  href = row.at_css('.result__a').attributes['href'].value rescue ''
16
- uri = URI.parse href
16
+ uri = URI.parse URI::encode(href)
17
17
  uri.host =~ whitelists_regex
18
18
  end
19
19
  end
@@ -13,7 +13,7 @@ module Plagiarism
13
13
  def valid_segments(ps, params)
14
14
  ps.segment.count do |sentence|
15
15
  typhoeus = fetch("\"#{sentence}\"", params)
16
- exists?(typhoeus.response_body)
16
+ typhoeus.success? && exists?(typhoeus.response_body)
17
17
  end
18
18
  end
19
19
 
@@ -1,18 +1,25 @@
1
1
  module Plagiarism
2
2
  module Strategies
3
3
  class Google < Engine
4
- URL = 'https://ajax.googleapis.com/ajax/services/search/web'
4
+ URL = 'https://www.googleapis.com/customsearch/v1'
5
5
  VERSION = '1.0'
6
6
 
7
7
  class << self
8
8
 
9
9
  def fetch(content, params)
10
- Typhoeus.get URL, params: params.merge(v: VERSION, q: content, rsz: :large)
10
+ Typhoeus.get URL, params: params.merge(
11
+ key: Config.google_key,
12
+ cx: Config.google_cx,
13
+ q: content,
14
+ fields: 'items(link)',
15
+ prettyPrint: false
16
+ )
11
17
  end
12
18
 
13
19
  def exists?(response)
14
- JSON.parse(response)['responseData']['results'].all? do |r|
15
- uri = URI.parse(r['unescapedUrl'])
20
+ results = JSON.parse(response)['items'] || []
21
+ results.all? do |r|
22
+ uri = URI.parse URI::encode(r['link'])
16
23
  uri.host =~ whitelists_regex
17
24
  end
18
25
  end
@@ -13,7 +13,7 @@ module Plagiarism
13
13
  doc = Nokogiri::HTML response
14
14
  doc.css('.searchCenterMiddle li').all? do |row|
15
15
  href = row.at_css('.compTitle div').content.strip rescue ''
16
- uri = URI.parse(href =~ /^http/ ? href : 'https://' + href)
16
+ uri = URI.parse URI::encode(href =~ /^http/ ? href : 'https://' + href)
17
17
  uri.host =~ whitelists_regex
18
18
  end
19
19
  end
@@ -1,3 +1,3 @@
1
1
  module Plagiarism
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: plagiarism2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - MQuy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-04-01 00:00:00.000000000 Z
11
+ date: 2016-04-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler