plagiarism2 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -2
- data/lib/plagiarism/cli.rb +21 -8
- data/lib/plagiarism/strategries/bing.rb +2 -2
- data/lib/plagiarism/strategries/duck.rb +2 -3
- data/lib/plagiarism/strategries/engine.rb +22 -4
- data/lib/plagiarism/strategries/free_google.rb +25 -0
- data/lib/plagiarism/strategries/google.rb +2 -2
- data/lib/plagiarism/strategries/yahoo.rb +4 -4
- data/lib/plagiarism/strategy.rb +11 -2
- data/lib/plagiarism/version.rb +1 -1
- data/lib/plagiarism.rb +4 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a5ae907124afcc6b114cb2fd1a575b7d0a4c5c85
|
4
|
+
data.tar.gz: b6c4a942896b1933ce60a7e3216b8c839bf422a5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bb93fde868664ee036d0fdaa13ee7de53235a372ec41daf5d72109f490b6985094e44afbd8cd189b77693f1a9283d2216010b7c040a5792505874b58e321ffdd
|
7
|
+
data.tar.gz: 93f3cf1b8fed1d7071a80ce4c92383beec0a1b49de3d27f6956c645cc269b8140e4b16d86bfd63384def1ab4ffa8feb9a70b8bf24dfe610f61f120e7b7ea65ef
|
data/README.md
CHANGED
@@ -22,20 +22,24 @@ Add in your config
|
|
22
22
|
|
23
23
|
```ruby
|
24
24
|
Plagiarism.configure do |config|
|
25
|
-
config.strategies = [xxx] # => [:google, :bing, :duck, :yahoo]
|
25
|
+
config.strategies = [xxx] # => [:google, :bing, :duck, :yahoo, :free_google]
|
26
26
|
config.whitelists = ['www.ring.md']
|
27
27
|
end
|
28
28
|
```
|
29
29
|
|
30
|
+
> There is a limit on using duck, yahoo and free google. If you send too many requests, they will mark you as spam.
|
31
|
+
|
30
32
|
+ To use the bing engine, you have to set an access key (you can get it from [here](https://datamarket.azure.com/dataset/bing/searchweb))
|
31
33
|
|
32
34
|
```ruby
|
35
|
+
config.strategies = :bing
|
33
36
|
config.bing_key = xxx
|
34
37
|
```
|
35
38
|
|
36
|
-
+ Using google engine, you have to set two keys (you can get it from [here](https://developers.google.com/custom-search/json-api/v1/using_rest))
|
39
|
+
+ To use the google engine (not free google), you have to set two keys (you can get them from [here](https://developers.google.com/custom-search/json-api/v1/using_rest))
|
37
40
|
|
38
41
|
```ruby
|
42
|
+
config.strategies = :google
|
39
43
|
config.google_key = xxx
|
40
44
|
config.google_cx = xx
|
41
45
|
```
|
data/lib/plagiarism/cli.rb
CHANGED
@@ -6,11 +6,22 @@ module Plagiarism
|
|
6
6
|
include Thor::Actions
|
7
7
|
CONFIG_PATH = '~/.plagiarism.yml'
|
8
8
|
|
9
|
+
class << self
|
10
|
+
def load_config(path)
|
11
|
+
config = YAML.load_file(File.expand_path path)
|
12
|
+
Config.strategies = config['strategies'].split(',').map &:strip
|
13
|
+
Config.whitelists = config['whitelists'].split(',').map &:strip
|
14
|
+
Config.bing_key = config['bing_key']
|
15
|
+
Config.google_key = config['google_key']
|
16
|
+
Config.google_cx = config['google_cx']
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
9
20
|
desc 'init', 'Create file for plagiarism to load config'
|
10
21
|
method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
|
11
22
|
def init
|
12
23
|
create_file options[:path], <<-STRING
|
13
|
-
strategies: "
|
24
|
+
strategies: "yahoo"
|
14
25
|
whitelists: "www.ring.md, blog.ring.md"
|
15
26
|
bing_key: xxx
|
16
27
|
google_key: xxx
|
@@ -22,14 +33,16 @@ google_cx: xxx
|
|
22
33
|
method_option :content, aliases: '-c', desc: 'content which need to checked', type: :string, required: true
|
23
34
|
method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
|
24
35
|
def unique
|
25
|
-
|
26
|
-
Config.strategies = config['strategies'].split(',').map &:strip
|
27
|
-
Config.whitelists = config['whitelists'].split(',').map &:strip
|
28
|
-
Config.bing_key = config['bing_key']
|
29
|
-
Config.google_key = config['google_key']
|
30
|
-
Config.google_cx = config['google_cx']
|
31
|
-
|
36
|
+
Cli.load_config(options[:path])
|
32
37
|
puts Plagiarism.unique? options[:content]
|
33
38
|
end
|
39
|
+
|
40
|
+
desc 'match', 'Get the first match from search engines'
|
41
|
+
method_option :content, aliases: '-c', desc: 'content which need to get', type: :string, required: true
|
42
|
+
method_option :path, aliases: '-p', desc: 'where put the config', type: :string, default: CONFIG_PATH
|
43
|
+
def match
|
44
|
+
Cli.load_config(options[:path])
|
45
|
+
puts Plagiarism.match options[:content]
|
46
|
+
end
|
34
47
|
end
|
35
48
|
end
|
@@ -9,11 +9,11 @@ module Plagiarism
|
|
9
9
|
Typhoeus.get(URL, params: params.merge('$format' => :json, 'Query' => "'#{content}'"), userpwd: ":#{Config.bing_key}")
|
10
10
|
end
|
11
11
|
|
12
|
-
def
|
12
|
+
def iterate(response)
|
13
13
|
results = JSON.parse(response)['d']['results'] rescue []
|
14
14
|
results.all? do |r|
|
15
15
|
uri = URI.parse URI::encode(r['Url'])
|
16
|
-
uri
|
16
|
+
yield uri
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
@@ -9,17 +9,16 @@ module Plagiarism
|
|
9
9
|
Typhoeus.get(URL, params: params.merge(q: content))
|
10
10
|
end
|
11
11
|
|
12
|
-
def
|
12
|
+
def iterate(response)
|
13
13
|
doc = Nokogiri::HTML response
|
14
14
|
doc.css('.results_links_deep:not(.result--no-result)').all? do |row|
|
15
15
|
href = row.at_css('.result__a').attributes['href'].value rescue ''
|
16
16
|
uri = URI.parse URI::encode(href)
|
17
|
-
uri
|
17
|
+
yield uri
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
21
|
end
|
22
|
-
|
23
22
|
end
|
24
23
|
end
|
25
24
|
end
|
@@ -10,6 +10,14 @@ module Plagiarism
|
|
10
10
|
raise
|
11
11
|
end
|
12
12
|
|
13
|
+
def iterate(r)
|
14
|
+
raise
|
15
|
+
end
|
16
|
+
|
17
|
+
def exists?(response)
|
18
|
+
iterate(response) { |uri| uri.host =~ whitelists_regex }
|
19
|
+
end
|
20
|
+
|
13
21
|
def valid_segments(ps, params)
|
14
22
|
ps.segment.count do |sentence|
|
15
23
|
typhoeus = fetch("\"#{sentence}\"", params)
|
@@ -17,10 +25,6 @@ module Plagiarism
|
|
17
25
|
end
|
18
26
|
end
|
19
27
|
|
20
|
-
def exists?(response)
|
21
|
-
raise
|
22
|
-
end
|
23
|
-
|
24
28
|
def whitelists_regex
|
25
29
|
whitelists = Config.whitelists.map { |w| Regexp.new w }
|
26
30
|
Regexp.union whitelists
|
@@ -31,11 +35,25 @@ module Plagiarism
|
|
31
35
|
@content, @params = c, p
|
32
36
|
end
|
33
37
|
|
38
|
+
def retrieve_link(response)
|
39
|
+
raise
|
40
|
+
end
|
41
|
+
|
34
42
|
def unique?
|
35
43
|
ps = PragmaticSegmenter::Segmenter.new(text: content)
|
36
44
|
valid_segments = self.class.valid_segments(ps, params)
|
37
45
|
valid_segments / ps.segment.size >= THRESHOLD
|
38
46
|
end
|
47
|
+
|
48
|
+
def match
|
49
|
+
typhoeus = self.class.fetch("\"#{content}\"", params)
|
50
|
+
typhoeus.success? && retrieve_link(typhoeus.response_body)
|
51
|
+
end
|
52
|
+
|
53
|
+
def retrieve_link(response)
|
54
|
+
self.class.iterate(response) { |uri| uri.host !~ self.class.whitelists_regex and return uri.to_s }
|
55
|
+
end
|
56
|
+
|
39
57
|
end
|
40
58
|
end
|
41
59
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Plagiarism
|
2
|
+
module Strategies
|
3
|
+
class FreeGoogle < Engine
|
4
|
+
URL = 'https://ajax.googleapis.com/ajax/services/search/web'
|
5
|
+
VERSION = '1.0'
|
6
|
+
|
7
|
+
class << self
|
8
|
+
|
9
|
+
def fetch(content, params)
|
10
|
+
Typhoeus.get URL, params: params.merge(v: VERSION, q: content, rsz: :large)
|
11
|
+
end
|
12
|
+
|
13
|
+
def iterate(response)
|
14
|
+
results = JSON.parse(response)['responseData']['results'] rescue []
|
15
|
+
results.all? do |r|
|
16
|
+
uri = URI.parse URI::encode(r['unescapedUrl'])
|
17
|
+
yield uri
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -16,11 +16,11 @@ module Plagiarism
|
|
16
16
|
)
|
17
17
|
end
|
18
18
|
|
19
|
-
def
|
19
|
+
def iterate(response)
|
20
20
|
results = JSON.parse(response)['items'] || []
|
21
21
|
results.all? do |r|
|
22
22
|
uri = URI.parse URI::encode(r['link'])
|
23
|
-
uri
|
23
|
+
yield uri
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
@@ -9,12 +9,12 @@ module Plagiarism
|
|
9
9
|
Typhoeus.get(URL, params: params.merge(p: content))
|
10
10
|
end
|
11
11
|
|
12
|
-
def
|
12
|
+
def iterate(response)
|
13
13
|
doc = Nokogiri::HTML response
|
14
14
|
doc.css('.searchCenterMiddle li').all? do |row|
|
15
|
-
href = row.at_css('.compTitle
|
16
|
-
uri = URI.parse URI::encode(href
|
17
|
-
uri
|
15
|
+
href = row.at_css('.compTitle a').attributes['href'].value rescue ''
|
16
|
+
uri = URI.parse URI::encode(href)
|
17
|
+
yield uri
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
data/lib/plagiarism/strategy.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'plagiarism/strategries/engine'
|
2
2
|
require 'plagiarism/strategries/google'
|
3
|
+
require 'plagiarism/strategries/free_google'
|
3
4
|
require 'plagiarism/strategries/bing'
|
4
5
|
require 'plagiarism/strategries/duck'
|
5
6
|
require 'plagiarism/strategries/yahoo'
|
@@ -8,8 +9,8 @@ module Plagiarism
|
|
8
9
|
module Strategy
|
9
10
|
extend self
|
10
11
|
|
11
|
-
def get(name = :
|
12
|
-
Strategies.const_get(name.to_s.
|
12
|
+
def get(name = :free_google)
|
13
|
+
Strategies.const_get(name.to_s.split('_').map(&:capitalize).join(''))
|
13
14
|
end
|
14
15
|
|
15
16
|
def unique?(content, params)
|
@@ -18,5 +19,13 @@ module Plagiarism
|
|
18
19
|
klass.new(content, params).unique?
|
19
20
|
end
|
20
21
|
end
|
22
|
+
|
23
|
+
def match(content, params)
|
24
|
+
Config.strategies.find do |strategy|
|
25
|
+
klass = get(strategy)
|
26
|
+
link = klass.new(content, params).match
|
27
|
+
link and return (link == true ? '' : link)
|
28
|
+
end
|
29
|
+
end
|
21
30
|
end
|
22
31
|
end
|
data/lib/plagiarism/version.rb
CHANGED
data/lib/plagiarism.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: plagiarism2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- MQuy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-04-
|
11
|
+
date: 2016-04-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -146,6 +146,7 @@ files:
|
|
146
146
|
- lib/plagiarism/strategries/bing.rb
|
147
147
|
- lib/plagiarism/strategries/duck.rb
|
148
148
|
- lib/plagiarism/strategries/engine.rb
|
149
|
+
- lib/plagiarism/strategries/free_google.rb
|
149
150
|
- lib/plagiarism/strategries/google.rb
|
150
151
|
- lib/plagiarism/strategries/yahoo.rb
|
151
152
|
- lib/plagiarism/strategy.rb
|