oald_parser 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/oald_parser/facade.rb +1 -1
- data/lib/oald_parser/formatter.rb +12 -19
- data/lib/oald_parser/page_parser.rb +26 -23
- data/lib/oald_parser.rb +5 -1
- metadata +2 -2
data/lib/oald_parser/facade.rb
CHANGED
@@ -16,7 +16,7 @@ module OaldParser
|
|
16
16
|
def self.create_facade
|
17
17
|
downloader = PageDownloader.new('http://www.oxfordadvancedlearnersdictionary.com/dictionary')
|
18
18
|
parser = PageParser.new
|
19
|
-
formatter = Formatter.new
|
19
|
+
formatter = Formatter.new
|
20
20
|
extractor = WordExtractor.new
|
21
21
|
Facade.new(downloader, parser, formatter, extractor)
|
22
22
|
end
|
data/lib/oald_parser/formatter.rb
CHANGED
@@ -2,36 +2,29 @@ require 'nokogiri'
|
|
2
2
|
|
3
3
|
module OaldParser
|
4
4
|
class Formatter
|
5
|
-
def initialize(options)
|
6
|
-
@options = options
|
7
|
-
end
|
8
|
-
|
9
5
|
def format(page)
|
10
|
-
|
11
|
-
format_blocks(page.blocks)
|
12
|
-
else
|
13
|
-
format_items(page.items)
|
14
|
-
end
|
6
|
+
format_blocks(page.blocks)
|
15
7
|
end
|
16
8
|
|
17
9
|
private
|
18
|
-
def format_blocks(blocks
|
10
|
+
def format_blocks(blocks)
|
19
11
|
blocks.collect do |block|
|
20
12
|
res = ''
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
13
|
+
unless block.text.empty?
|
14
|
+
res += block.text.upcase
|
15
|
+
res += "\n"
|
16
|
+
res += '-' * 20
|
17
|
+
res += "\n"
|
18
|
+
end
|
25
19
|
res += format_items(block.items)
|
26
20
|
res
|
27
21
|
end.join("\n\n")
|
28
22
|
end
|
29
23
|
|
30
|
-
def format_items(items
|
24
|
+
def format_items(items)
|
31
25
|
items.collect do |item|
|
32
|
-
res =
|
33
|
-
|
34
|
-
if !item.examples.empty?
|
26
|
+
res = item.text
|
27
|
+
unless item.examples.empty?
|
35
28
|
res += "\n"
|
36
29
|
res += format_examples(item.examples)
|
37
30
|
end
|
@@ -40,7 +33,7 @@ module OaldParser
|
|
40
33
|
end
|
41
34
|
|
42
35
|
def format_examples(examples)
|
43
|
-
examples.collect
|
36
|
+
examples.collect{|e| "+ #{e}"}.join("\n")
|
44
37
|
end
|
45
38
|
end
|
46
39
|
end
|
data/lib/oald_parser/page_parser.rb
CHANGED
@@ -2,9 +2,13 @@ require 'nokogiri'
|
|
2
2
|
|
3
3
|
module OaldParser
|
4
4
|
class Page
|
5
|
-
attr_reader :blocks
|
6
|
-
def initialize(blocks
|
7
|
-
@blocks
|
5
|
+
attr_reader :blocks
|
6
|
+
def initialize(blocks)
|
7
|
+
@blocks = blocks
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.empty
|
11
|
+
Page.new []
|
8
12
|
end
|
9
13
|
end
|
10
14
|
|
@@ -26,26 +30,28 @@ module OaldParser
|
|
26
30
|
def parse(page)
|
27
31
|
parsed = Nokogiri::HTML(page)
|
28
32
|
if blocks_on_page? parsed
|
29
|
-
Page.new(parse_block(parsed)
|
30
|
-
elsif def_on_page? parsed
|
31
|
-
Page.new([], parse_def(parsed))
|
33
|
+
Page.new(parse_block(parsed))
|
32
34
|
else
|
33
|
-
|
35
|
+
parse_page_from_items(parsed)
|
34
36
|
end
|
35
37
|
end
|
36
38
|
|
37
39
|
private
|
38
|
-
def
|
39
|
-
|
40
|
+
def parse_page_from_items(parsed)
|
41
|
+
items = parse_items(parsed)
|
42
|
+
if items.empty?
|
43
|
+
Page.empty
|
44
|
+
else
|
45
|
+
Page.new([Block.new("", items)])
|
46
|
+
end
|
40
47
|
end
|
41
48
|
|
42
|
-
def
|
43
|
-
page.css('div.
|
49
|
+
def blocks_on_page?(page)
|
50
|
+
page.css('div.sd-g').first
|
44
51
|
end
|
45
52
|
|
46
53
|
def parse_block(page)
|
47
|
-
|
48
|
-
block_nodes.collect do |block|
|
54
|
+
page.css('div.sd-g').collect do |block|
|
49
55
|
block_text = all_except(block, 'n-g')
|
50
56
|
items = parse_items(block)
|
51
57
|
Block.new(block_text, items)
|
@@ -53,19 +59,16 @@ module OaldParser
|
|
53
59
|
end
|
54
60
|
|
55
61
|
def parse_items(block)
|
56
|
-
|
57
|
-
item_nodes.collect do |item|
|
62
|
+
items = block.css('span.n-g').collect do |item|
|
58
63
|
item_text = all_except(item, 'x-g')
|
59
64
|
Item.new(item_text, parse_examples(item))
|
60
|
-
end
|
61
|
-
end
|
65
|
+
end
|
62
66
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
end
|
67
|
+
if block.css('div.def_block').first
|
68
|
+
item_text = block.css('div.def_block').first.text.strip
|
69
|
+
items << Item.new(item_text, parse_examples(block))
|
70
|
+
end
|
71
|
+
items
|
69
72
|
end
|
70
73
|
|
71
74
|
def parse_examples(item)
|
data/lib/oald_parser.rb
CHANGED
@@ -8,9 +8,13 @@ require_relative 'oald_parser/page_parser'
|
|
8
8
|
#include OaldParser
|
9
9
|
#
|
10
10
|
#downloader = PageDownloader.new("http://www.oxfordadvancedlearnersdictionary.com/dictionary")
|
11
|
-
#page = downloader.download("
|
11
|
+
#page = downloader.download("a")
|
12
|
+
##puts page
|
13
|
+
#
|
12
14
|
#parser = PageParser.new
|
13
15
|
#parsed = parser.parse(page)
|
16
|
+
#puts parsed.inspect
|
17
|
+
#
|
14
18
|
#formatter = Formatter.new(items: 15)
|
15
19
|
#puts formatter.format(parsed)
|
16
20
|
|