nebrija 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +9 -0
- data/bin/nebrija +9 -0
- data/lib/nebrija/parser.rb +75 -0
- data/lib/nebrija.rb +64 -0
- data/test/test_basic.rb +63 -0
- metadata +100 -0
data/Rakefile
ADDED
data/bin/nebrija
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
# Converts the HTML of a RAE dictionary response into plain Ruby hashes/arrays.
#
# The document is queried through CSS selectors only:
# * 'title'              - a title containing "error" marks an unknown word
# * 'body > ul'          - present when the response is a list of candidates
# * 'body > div > a'     - anchor whose 'name' attribute is used as the id
# * 'body > div > p'     - class 'p' paragraphs are entry headings, any other
#                          paragraph is handled as a definition
class Parser

  # Run of "<letters>. " tokens at the very start of a definition. The matched
  # prefix is reported as the definition's :meta (presumably grammatical
  # abbreviations such as "m." or "f." -- confirm against real responses).
  META_REGEX = /^([a-zA-Z]{1,4}+\.[ ]{1,2})+/

  # Title given to an entry whose heading carries no text.
  UNTITLED_ENTRY = '=>'

  # @param rae_data [String] raw HTML; newlines are dropped and runs of spaces
  #   are collapsed before it is handed to Nokogiri
  # @param word [String] the searched word, substituted for "~" in entry titles
  def initialize(rae_data, word)
    flattened = rae_data.gsub(/[\n]+/, '').gsub(/[ ]{2,}+/, ' ')
    @doc = Nokogiri::HTML(flattened)
    @word = word
  end

  # @return [Hash, Array<Hash>]
  #   * {:error => String} when the document title contains "error"
  #   * {:id => Integer, :data => Array<Hash>} for a single-word document
  #   * an Array of {:word => String, :href => String} for a candidate list
  def parse
    return {:error => 'Word does not exist. Sorry.'} unless valid?

    if single?
      parse_single
    else
      parse_multiple
    end
  end

  # @return [Boolean] true when the body holds no top-level <ul>
  def single?
    @doc.css('body > ul').empty?
  end

  private

  # Builds {:id, :data} from a single-word document.
  def parse_single
    anchor = @doc.css('body > div > a').first
    entries = []

    @doc.css('body > div > p').each do |paragraph|
      if paragraph['class'] == 'p'
        entries << new_entry(paragraph.css('span').inner_text)
        next
      end

      meaning = parse_meaning(paragraph.inner_text)
      next if meaning.nil?

      # A definition seen before any heading used to raise NoMethodError
      # (entries[-1] is nil on an empty array); file it under an untitled entry.
      entries << new_entry('') if entries.empty?
      entries.last[:meanings] << meaning
    end

    # A document without the anchor yields id 0, the same value an anchor
    # lacking a 'name' attribute produces.
    {:id => (anchor && anchor['name']).to_i, :data => entries}
  end

  # Creates an empty entry; "~" in the title stands for the searched word.
  def new_entry(title)
    title = UNTITLED_ENTRY if title == ''
    {:word => title.gsub(/~/, @word).strip.capitalize, :meanings => []}
  end

  # Turns the text of one definition paragraph into {:word, :meta}.
  # Returns nil for paragraphs starting with "(" and for paragraphs that are
  # empty once numbering and metadata have been removed.
  def parse_meaning(raw_text)
    text = raw_text.strip.gsub(/[0-9]+\.[ ]/, '')
    return nil if text.start_with?('(')

    # String#[] returns the whole match. String#scan would return only the
    # last repetition of the capture group, dropping all but the final token.
    meta = text[META_REGEX].to_s.strip
    definition = text.gsub(META_REGEX, '')
    return nil if definition == ''

    {:word => definition, :meta => (meta unless meta == '')}
  end

  # Builds the candidate list from the links under 'body > ul > li'.
  def parse_multiple
    @doc.css('body > ul > li > a').map do |link|
      {
        :word => link.css('span').first.inner_text,
        :href => link['href'].gsub(/search\?id=/, '')
      }
    end
  end

  # @return [Boolean] false when the document title contains "error"
  def valid?
    (@doc.css('title').inner_text =~ /error/).nil?
  end
end
|
data/lib/nebrija.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'nebrija/parser'
|
2
|
+
require 'typhoeus'
|
3
|
+
|
4
|
+
|
5
|
+
# Base dictionary client. Subclasses supply #query; #search feeds whatever
# #query returns to Parser and hands back the parsed result.
class Rae

  # @param word [String] passed to #query and, unchanged, to Parser
  # @return whatever Parser#parse returns for the fetched data
  def search(word)
    raw_html = query(word)
    Parser.new(raw_html, word).parse
  end

  private

  # Hook to be overridden. The base version always fails with a RuntimeError
  # whose message is 'NotImplementedError'.
  def query(word)
    raise RuntimeError, 'NotImplementedError'
  end
end
|
16
|
+
|
17
|
+
|
18
|
+
# Rae backend that takes the dictionary HTML from a local file; the argument
# given to #search is used as the file path.
class FileRae < Rae

  private

  # @param file [String] path of the HTML file to load
  # @return [String] the file's contents
  #
  # File.read is used instead of IO.read: IO.read treats a path that starts
  # with "|" as a command to run, File.read always opens it as a file.
  def query(file)
    File.read(file)
  end
end
|
25
|
+
|
26
|
+
|
27
|
+
# Rae backend that fetches the dictionary HTML over HTTP with Typhoeus.
class HTTPRae < Rae
  # Browser-like User-Agent sent with every request.
  USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'
  # Search endpoint; the query string is appended directly after the "?".
  SEARCH_URL = 'http://lema.rae.es/drae/srv/search?'
  # Seconds to wait for the server. The original line had no value, which made
  # the constant an alias of ID_REGEX; 10 is a chosen default.
  REQUEST_TIMEOUT = 10
  # A search term containing a digit is sent as an entry id, not as a word.
  ID_REGEX = /[0-9]/

  private

  # Posts the search and returns the response body.
  #
  # @param word [String] a word (sent as "val=") or an entry id (sent as "id=")
  # @return [String] the body of the HTTP response
  # NOTE(review): +word+ is interpolated into the URL without escaping, as in
  # the original -- confirm how the server expects non-ASCII terms encoded.
  def query(word)
    @word = word

    param = val? ? 'val=' : 'id='

    response = Typhoeus::Request.post(
      "#{SEARCH_URL}#{param}#{word}",
      body: build_headers,
      headers: { 'User-Agent' => USER_AGENT },
      timeout: REQUEST_TIMEOUT
    )
    response.body
  end

  # @return [Boolean] true when the current term has no digit, i.e. it is a
  #   word to look up rather than an id
  def val?
    (@word =~ ID_REGEX).nil?
  end

  # Serialises the fixed form fields that are posted as the request body
  # (despite the method name, these are sent as the body, not as headers).
  # NOTE(review): the TS014dfc77_* fields look like challenge tokens expected
  # by the server -- confirm they are still accepted.
  #
  # @return [String] "key=value" pairs joined with "&"
  def build_headers
    fields = {
      'TS014dfc77_id' => 3,
      'TS014dfc77_cr' => '42612abd48551544c72ae36bc40f440a%3Akkmj%3AQG60Q2v4%3A1477350835',
      'TS014dfc77_76' => 0,
      'TS014dfc77_md' => 1,
      'TS014dfc77_rf' => 0,
      'TS014dfc77_ct' => 0,
      'TS014dfc77_pd' => 0
    }
    fields.map { |key, value| "#{key}=#{value}" }.join('&')
  end
end
|
data/test/test_basic.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'nebrija'
|
3
|
+
|
4
|
+
# Directory holding the HTML fixtures. Resolved relative to this file, not the
# current working directory, so the tests can be launched from anywhere.
MOCKS_DIR = File.expand_path('../mocks', __FILE__)
|
5
|
+
|
6
|
+
# Smoke tests that run FileRae against the HTML fixtures under MOCKS_DIR.
class TestMockedParserBasic < Test::Unit::TestCase

  # The error fixture must produce a result carrying an :error entry.
  def test_error_basic
    outcome = mocked_search('error.html')
    assert_not_nil outcome[:error]
  end

  # The single-word fixture must produce a result carrying :data.
  def test_single_basic
    outcome = mocked_search('single.html')
    assert_not_nil outcome[:data]
  end

  # The multiple-candidates fixture must produce exactly two items.
  def test_multiple_basic
    outcome = mocked_search('multiple.html')
    assert outcome.length == 2
  end

  private

  # Searches the named fixture file with a fresh FileRae.
  def mocked_search(fixture)
    FileRae.new.search(File.join(MOCKS_DIR, fixture))
  end
end
|
20
|
+
|
21
|
+
# Content checks for FileRae results built from the HTML fixtures under MOCKS_DIR.
class TestMockedParserContent < Test::Unit::TestCase

  # The single-word fixture is expected to yield more than 20 entries.
  def test_single_basic
    entries = FileRae.new.search(File.join(MOCKS_DIR, 'single.html'))[:data]
    assert entries.length > 20
  end

  # The candidates fixture lists 'bancar' first and 'banco' second; the
  # fixture is searched anew for each position, as before.
  def test_multiple_basic
    %w[bancar banco].each_with_index do |expected, position|
      found = FileRae.new.search(File.join(MOCKS_DIR, 'multiple.html'))[position][:word]
      assert found == expected
    end
  end
end
|
32
|
+
|
33
|
+
|
34
|
+
# Smoke tests that run HTTPRae against the live dictionary service.
#
# Named TestParserBasic so that it no longer reopens TestMockedParserBasic:
# the two share the method names test_error_basic, test_single_basic and
# test_multiple_basic, so reopening replaced the fixture-based tests with
# these network-based ones.
class TestParserBasic < Test::Unit::TestCase

  # A term containing digits is looked up as an entry id.
  def test_single_basic_id
    assert_not_nil HTTPRae.new.search('MHpGWYJ6YDXX2bw9Ghwm')[:data]
  end

  # A made-up word is expected to produce an :error entry.
  def test_error_basic
    assert_not_nil HTTPRae.new.search('jddhfgsd')[:error]
  end

  # A plain word is expected to produce :data.
  def test_single_basic
    assert_not_nil HTTPRae.new.search('a')[:data]
  end

  # 'banco' is expected to produce a list of two candidates.
  def test_multiple_basic
    assert HTTPRae.new.search('banco').length == 2
  end
end
|
52
|
+
|
53
|
+
# Content checks for HTTPRae results fetched from the live dictionary service.
class TestParserContent < Test::Unit::TestCase

  # Searching 'a' is expected to yield more than 4 entries.
  def test_single_basic
    entries = HTTPRae.new.search('a')[:data]
    assert entries.length > 4
  end

  # Searching 'banco' lists 'bancar' first and 'banco' second; one request is
  # issued per checked position, as before.
  def test_multiple_basic
    %w[bancar banco].each_with_index do |expected, position|
      found = HTTPRae.new.search('banco')[position][:word]
      assert found == expected
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: nebrija
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- ! '@javierhonduco'
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-07-11 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: typhoeus
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rake
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: A gem to access the rae dictionary
|
63
|
+
email: a@a.a
|
64
|
+
executables:
|
65
|
+
- nebrija
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
files:
|
69
|
+
- Rakefile
|
70
|
+
- lib/nebrija.rb
|
71
|
+
- lib/nebrija/parser.rb
|
72
|
+
- bin/nebrija
|
73
|
+
- test/test_basic.rb
|
74
|
+
homepage: http://rubygems.org/gems/nebrija
|
75
|
+
licenses:
|
76
|
+
- MIT
|
77
|
+
post_install_message:
|
78
|
+
rdoc_options: []
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
none: false
|
83
|
+
requirements:
|
84
|
+
- - ! '>='
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0'
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
none: false
|
89
|
+
requirements:
|
90
|
+
- - ! '>='
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
requirements: []
|
94
|
+
rubyforge_project:
|
95
|
+
rubygems_version: 1.8.23
|
96
|
+
signing_key:
|
97
|
+
specification_version: 3
|
98
|
+
summary: dictionary gem and stuff
|
99
|
+
test_files:
|
100
|
+
- test/test_basic.rb
|