liveblog-indexer 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/liveblog-indexer.rb +47 -17
- metadata +7 -7
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 711ba6e2a290dfc67dfc1703ffcddf417c7c90d3
|
4
|
+
data.tar.gz: 5525552551eee1953370b99034d2a712bb308ac2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8648c96fdf3f0bf7c0b1ccc72f99bef2759db323a159e4361fc479f7d0d5847b2ad0f04361533e161f1a0cd6eb5b85544823f0159cd270adb29e6ac00cce10ed
|
7
|
+
data.tar.gz: 075d272a0af388c4addbcfb141ac0de8143d60b5b7ada4575d93a9ba84cc8e5b24b3c08490171196708012dbbed65dd5f2a588bdde538290f20ba121f79ced1a
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data.tar.gz.sig
CHANGED
Binary file
|
data/lib/liveblog-indexer.rb
CHANGED
@@ -4,13 +4,14 @@
|
|
4
4
|
|
5
5
|
require 'xws'
|
6
6
|
require 'json'
|
7
|
-
require '
|
7
|
+
require 'rxfhelper'
|
8
8
|
|
9
9
|
|
10
10
|
class LiveBlogIndexer
|
11
11
|
|
12
|
-
def initialize(filepath=
|
12
|
+
def initialize(filepath='wordindex.json', urls_indexed: 'urls-indexed.json')
|
13
13
|
|
14
|
+
@wordindex_filepath, @urls_index_filepath = filepath, urls_indexed
|
14
15
|
@master = if filepath and File.exists? filepath then
|
15
16
|
JSON.parse(File.read(filepath))
|
16
17
|
else
|
@@ -18,17 +19,26 @@ class LiveBlogIndexer
|
|
18
19
|
end
|
19
20
|
|
20
21
|
@xws = XWS.new
|
22
|
+
@url_index = {}
|
21
23
|
|
22
24
|
end
|
23
25
|
|
24
26
|
def add_index(src)
|
25
27
|
|
26
|
-
doc =
|
28
|
+
doc = if src.is_a? String then
|
29
|
+
Rexle.new(RXFHelper.read(src).first )
|
30
|
+
else
|
31
|
+
src
|
32
|
+
end
|
33
|
+
|
34
|
+
link = doc.root.element('summary/link/text()')
|
35
|
+
return unless link
|
36
|
+
|
27
37
|
sections = doc.root.xpath 'records/section'
|
28
38
|
|
29
39
|
sections.each do |section|
|
30
40
|
|
31
|
-
url = "%s/#%s" % [
|
41
|
+
url = "%s/#%s" % [link[/^https?:\/\/[^\/]+(.*)(?=\/$)/,1], \
|
32
42
|
section.attributes[:id]]
|
33
43
|
|
34
44
|
h = @xws.scan section.element('details')
|
@@ -52,25 +62,45 @@ class LiveBlogIndexer
|
|
52
62
|
end # /h
|
53
63
|
end # /section
|
54
64
|
end # /add_index
|
55
|
-
|
65
|
+
|
66
|
+
def crawl(location)
|
67
|
+
|
68
|
+
index_file location
|
69
|
+
save @wordindex_filepath
|
70
|
+
File.write @urls_index_filepath, @url_index.to_json
|
71
|
+
|
72
|
+
end
|
56
73
|
|
57
74
|
def save(filepath=nil)
|
58
75
|
|
59
76
|
File.write filepath, @master.to_json
|
60
77
|
puts 'saved ' + File.basename(filepath)
|
61
78
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
79
|
+
end
|
80
|
+
|
81
|
+
private
|
82
|
+
|
83
|
+
def index_file(location)
|
84
|
+
|
85
|
+
puts 'indexing : ' + location.inspect
|
86
|
+
doc = Rexle.new(RXFHelper.read(location).first)
|
87
|
+
summary = doc.root.element 'summary'
|
88
|
+
return unless summary
|
89
|
+
|
90
|
+
result = add_index doc
|
91
|
+
return unless result
|
92
|
+
|
93
|
+
prev_day = summary.text 'prev_day'
|
94
|
+
|
95
|
+
if prev_day then
|
96
|
+
url = prev_day + 'formatted.xml'
|
97
|
+
@url_index[url] = {last_indexed: Time.now}
|
98
|
+
index_file(url)
|
70
99
|
end
|
71
|
-
|
72
|
-
|
73
|
-
|
100
|
+
end
|
101
|
+
|
102
|
+
def save_urlsindex(filepath)
|
103
|
+
|
74
104
|
end
|
75
105
|
|
76
|
-
end
|
106
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: liveblog-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -31,7 +31,7 @@ cert_chain:
|
|
31
31
|
mOnsi2V1CXpq2biJtSgD7mBx4cO9FXgbK3Xnsv45ygAPo6jj4Eb34udqz+0v88Ys
|
32
32
|
KoTqNQOniHAW2w==
|
33
33
|
-----END CERTIFICATE-----
|
34
|
-
date: 2015-11-
|
34
|
+
date: 2015-11-13 00:00:00.000000000 Z
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: xws
|
@@ -54,25 +54,25 @@ dependencies:
|
|
54
54
|
- !ruby/object:Gem::Version
|
55
55
|
version: 0.1.1
|
56
56
|
- !ruby/object:Gem::Dependency
|
57
|
-
name:
|
57
|
+
name: rxfhelper
|
58
58
|
requirement: !ruby/object:Gem::Requirement
|
59
59
|
requirements:
|
60
60
|
- - "~>"
|
61
61
|
- !ruby/object:Gem::Version
|
62
|
-
version: '
|
62
|
+
version: '0.2'
|
63
63
|
- - ">="
|
64
64
|
- !ruby/object:Gem::Version
|
65
|
-
version:
|
65
|
+
version: 0.2.3
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
68
|
version_requirements: !ruby/object:Gem::Requirement
|
69
69
|
requirements:
|
70
70
|
- - "~>"
|
71
71
|
- !ruby/object:Gem::Version
|
72
|
-
version: '
|
72
|
+
version: '0.2'
|
73
73
|
- - ">="
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
75
|
+
version: 0.2.3
|
76
76
|
description:
|
77
77
|
email: james@r0bertson.co.uk
|
78
78
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|