plagiarism2 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -2
- data/lib/plagiarism/cli.rb +21 -8
- data/lib/plagiarism/strategries/bing.rb +2 -2
- data/lib/plagiarism/strategries/duck.rb +2 -3
- data/lib/plagiarism/strategries/engine.rb +22 -4
- data/lib/plagiarism/strategries/free_google.rb +25 -0
- data/lib/plagiarism/strategries/google.rb +2 -2
- data/lib/plagiarism/strategries/yahoo.rb +4 -4
- data/lib/plagiarism/strategy.rb +11 -2
- data/lib/plagiarism/version.rb +1 -1
- data/lib/plagiarism.rb +4 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a5ae907124afcc6b114cb2fd1a575b7d0a4c5c85
|
4
|
+
data.tar.gz: b6c4a942896b1933ce60a7e3216b8c839bf422a5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bb93fde868664ee036d0fdaa13ee7de53235a372ec41daf5d72109f490b6985094e44afbd8cd189b77693f1a9283d2216010b7c040a5792505874b58e321ffdd
|
7
|
+
data.tar.gz: 93f3cf1b8fed1d7071a80ce4c92383beec0a1b49de3d27f6956c645cc269b8140e4b16d86bfd63384def1ab4ffa8feb9a70b8bf24dfe610f61f120e7b7ea65ef
|
data/README.md
CHANGED
@@ -22,20 +22,24 @@ Add in your config
|
|
22
22
|
|
23
23
|
```ruby
|
24
24
|
Plagiarism.configure do |config|
|
25
|
-
config.strategies = [xxx] # => [:google, :bing, :duck, :yahoo]
|
25
|
+
config.strategies = [xxx] # => [:google, :bing, :duck, :yahoo, :free_google]
|
26
26
|
config.whitelists = ['www.ring.md']
|
27
27
|
end
|
28
28
|
```
|
29
29
|
|
30
|
+
> There is a limit in using duck, yahoo and free google. If you spam request, they will mark you as spam
|
31
|
+
|
30
32
|
+ Using bing engine, you have to set access key (you can get it from [here](https://datamarket.azure.com/dataset/bing/searchweb))
|
31
33
|
|
32
34
|
```ruby
|
35
|
+
config.strategies = :bing
|
33
36
|
config.bing_key = xxx
|
34
37
|
```
|
35
38
|
|
36
|
-
+ Using google engine, you have to set two keys (you can get it from [here](https://developers.google.com/custom-search/json-api/v1/using_rest))
|
39
|
+
+ Using google engine (not free google), you have to set two keys (you can get it from [here](https://developers.google.com/custom-search/json-api/v1/using_rest))
|
37
40
|
|
38
41
|
```ruby
|
42
|
+
config.strategies = :google
|
39
43
|
config.google_key = xxx
|
40
44
|
config.google_cx = xx
|
41
45
|
```
|
data/lib/plagiarism/cli.rb
CHANGED
@@ -6,11 +6,22 @@ module Plagiarism
|
|
6
6
|
include Thor::Actions
|
7
7
|
CONFIG_PATH = '~/.plagiarism.yml'
|
8
8
|
|
9
|
+
class << self
|
10
|
+
def load_config(path)
|
11
|
+
config = YAML.load_file(File.expand_path path)
|
12
|
+
Config.strategies = config['strategies'].split(',').map &:strip
|
13
|
+
Config.whitelists = config['whitelists'].split(',').map &:strip
|
14
|
+
Config.bing_key = config['bing_key']
|
15
|
+
Config.google_key = config['google_key']
|
16
|
+
Config.google_cx = config['google_cx']
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
9
20
|
desc 'init', 'Create file for plagiarism to load config'
|
10
21
|
method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
|
11
22
|
def init
|
12
23
|
create_file options[:path], <<-STRING
|
13
|
-
strategies: "
|
24
|
+
strategies: "yahoo"
|
14
25
|
whitelists: "www.ring.md, blog.ring.md"
|
15
26
|
bing_key: xxx
|
16
27
|
google_key: xxx
|
@@ -22,14 +33,16 @@ google_cx: xxx
|
|
22
33
|
method_option :content, aliases: '-c', desc: 'content which need to checked', type: :string, required: true
|
23
34
|
method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
|
24
35
|
def unique
|
25
|
-
|
26
|
-
Config.strategies = config['strategies'].split(',').map &:strip
|
27
|
-
Config.whitelists = config['whitelists'].split(',').map &:strip
|
28
|
-
Config.bing_key = config['bing_key']
|
29
|
-
Config.google_key = config['google_key']
|
30
|
-
Config.google_cx = config['google_cx']
|
31
|
-
|
36
|
+
Cli.load_config(options[:path])
|
32
37
|
puts Plagiarism.unique? options[:content]
|
33
38
|
end
|
39
|
+
|
40
|
+
desc 'match', 'Get the first match from search engines'
|
41
|
+
method_option :content, aliases: '-c', desc: 'content which need to get', type: :string, required: true
|
42
|
+
method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
|
43
|
+
def match
|
44
|
+
Cli.load_config(options[:path])
|
45
|
+
puts Plagiarism.match options[:content]
|
46
|
+
end
|
34
47
|
end
|
35
48
|
end
|
@@ -9,11 +9,11 @@ module Plagiarism
|
|
9
9
|
Typhoeus.get(URL, params: params.merge('$format' => :json, 'Query' => "'#{content}'"), userpwd: ":#{Config.bing_key}")
|
10
10
|
end
|
11
11
|
|
12
|
-
def exists?(response)
|
12
|
+
def iterate(response)
|
13
13
|
results = JSON.parse(response)['d']['results'] rescue []
|
14
14
|
results.all? do |r|
|
15
15
|
uri = URI.parse URI::encode(r['Url'])
|
16
|
-
uri
|
16
|
+
yield uri
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
@@ -9,17 +9,16 @@ module Plagiarism
|
|
9
9
|
Typhoeus.get(URL, params: params.merge(q: content))
|
10
10
|
end
|
11
11
|
|
12
|
-
def exists?(response)
|
12
|
+
def iterate(response)
|
13
13
|
doc = Nokogiri::HTML response
|
14
14
|
doc.css('.results_links_deep:not(.result--no-result)').all? do |row|
|
15
15
|
href = row.at_css('.result__a').attributes['href'].value rescue ''
|
16
16
|
uri = URI.parse URI::encode(href)
|
17
|
-
uri
|
17
|
+
yield uri
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
21
|
end
|
22
|
-
|
23
22
|
end
|
24
23
|
end
|
25
24
|
end
|
@@ -10,6 +10,14 @@ module Plagiarism
|
|
10
10
|
raise
|
11
11
|
end
|
12
12
|
|
13
|
+
def iterate(r)
|
14
|
+
raise
|
15
|
+
end
|
16
|
+
|
17
|
+
def exists?(response)
|
18
|
+
iterate(response) { |uri| uri.host =~ whitelists_regex }
|
19
|
+
end
|
20
|
+
|
13
21
|
def valid_segments(ps, params)
|
14
22
|
ps.segment.count do |sentence|
|
15
23
|
typhoeus = fetch("\"#{sentence}\"", params)
|
@@ -17,10 +25,6 @@ module Plagiarism
|
|
17
25
|
end
|
18
26
|
end
|
19
27
|
|
20
|
-
def exists?(response)
|
21
|
-
raise
|
22
|
-
end
|
23
|
-
|
24
28
|
def whitelists_regex
|
25
29
|
whitelists = Config.whitelists.map { |w| Regexp.new w }
|
26
30
|
Regexp.union whitelists
|
@@ -31,11 +35,25 @@ module Plagiarism
|
|
31
35
|
@content, @params = c, p
|
32
36
|
end
|
33
37
|
|
38
|
+
def retrieve_link(response)
|
39
|
+
raise
|
40
|
+
end
|
41
|
+
|
34
42
|
def unique?
|
35
43
|
ps = PragmaticSegmenter::Segmenter.new(text: content)
|
36
44
|
valid_segments = self.class.valid_segments(ps, params)
|
37
45
|
valid_segments / ps.segment.size >= THRESHOLD
|
38
46
|
end
|
47
|
+
|
48
|
+
def match
|
49
|
+
typhoeus = self.class.fetch("\"#{content}\"", params)
|
50
|
+
typhoeus.success? && retrieve_link(typhoeus.response_body)
|
51
|
+
end
|
52
|
+
|
53
|
+
def retrieve_link(response)
|
54
|
+
self.class.iterate(response) { |uri| uri.host !~ self.class.whitelists_regex and return uri.to_s }
|
55
|
+
end
|
56
|
+
|
39
57
|
end
|
40
58
|
end
|
41
59
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Plagiarism
|
2
|
+
module Strategies
|
3
|
+
class FreeGoogle < Engine
|
4
|
+
URL = 'https://ajax.googleapis.com/ajax/services/search/web'
|
5
|
+
VERSION = '1.0'
|
6
|
+
|
7
|
+
class << self
|
8
|
+
|
9
|
+
def fetch(content, params)
|
10
|
+
Typhoeus.get URL, params: params.merge(v: VERSION, q: content, rsz: :large)
|
11
|
+
end
|
12
|
+
|
13
|
+
def iterate(response)
|
14
|
+
results = JSON.parse(response)['responseData']['results'] rescue []
|
15
|
+
results.all? do |r|
|
16
|
+
uri = URI.parse URI::encode(r['unescapedUrl'])
|
17
|
+
yield uri
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -16,11 +16,11 @@ module Plagiarism
|
|
16
16
|
)
|
17
17
|
end
|
18
18
|
|
19
|
-
def exists?(response)
|
19
|
+
def iterate(response)
|
20
20
|
results = JSON.parse(response)['items'] || []
|
21
21
|
results.all? do |r|
|
22
22
|
uri = URI.parse URI::encode(r['link'])
|
23
|
-
uri
|
23
|
+
yield uri
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
@@ -9,12 +9,12 @@ module Plagiarism
|
|
9
9
|
Typhoeus.get(URL, params: params.merge(p: content))
|
10
10
|
end
|
11
11
|
|
12
|
-
def exists?(response)
|
12
|
+
def iterate(response)
|
13
13
|
doc = Nokogiri::HTML response
|
14
14
|
doc.css('.searchCenterMiddle li').all? do |row|
|
15
|
-
href = row.at_css('.compTitle
|
16
|
-
uri = URI.parse URI::encode(href
|
17
|
-
uri
|
15
|
+
href = row.at_css('.compTitle a').attributes['href'].value rescue ''
|
16
|
+
uri = URI.parse URI::encode(href)
|
17
|
+
yield uri
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
data/lib/plagiarism/strategy.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'plagiarism/strategries/engine'
|
2
2
|
require 'plagiarism/strategries/google'
|
3
|
+
require 'plagiarism/strategries/free_google'
|
3
4
|
require 'plagiarism/strategries/bing'
|
4
5
|
require 'plagiarism/strategries/duck'
|
5
6
|
require 'plagiarism/strategries/yahoo'
|
@@ -8,8 +9,8 @@ module Plagiarism
|
|
8
9
|
module Strategy
|
9
10
|
extend self
|
10
11
|
|
11
|
-
def get(name = :
|
12
|
-
Strategies.const_get(name.to_s.
|
12
|
+
def get(name = :free_google)
|
13
|
+
Strategies.const_get(name.to_s.split('_').map(&:capitalize).join(''))
|
13
14
|
end
|
14
15
|
|
15
16
|
def unique?(content, params)
|
@@ -18,5 +19,13 @@ module Plagiarism
|
|
18
19
|
klass.new(content, params).unique?
|
19
20
|
end
|
20
21
|
end
|
22
|
+
|
23
|
+
def match(content, params)
|
24
|
+
Config.strategies.find do |strategy|
|
25
|
+
klass = get(strategy)
|
26
|
+
link = klass.new(content, params).match
|
27
|
+
link and return (link == true ? '' : link)
|
28
|
+
end
|
29
|
+
end
|
21
30
|
end
|
22
31
|
end
|
data/lib/plagiarism/version.rb
CHANGED
data/lib/plagiarism.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: plagiarism2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- MQuy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-04-
|
11
|
+
date: 2016-04-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -146,6 +146,7 @@ files:
|
|
146
146
|
- lib/plagiarism/strategries/bing.rb
|
147
147
|
- lib/plagiarism/strategries/duck.rb
|
148
148
|
- lib/plagiarism/strategries/engine.rb
|
149
|
+
- lib/plagiarism/strategries/free_google.rb
|
149
150
|
- lib/plagiarism/strategries/google.rb
|
150
151
|
- lib/plagiarism/strategries/yahoo.rb
|
151
152
|
- lib/plagiarism/strategy.rb
|