husc 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +57 -12
- data/husc.gemspec +1 -1
- data/lib/husc/version.rb +1 -1
- data/lib/husc.rb +7 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 70da435ce2b15bb485ce958997a91488cf19f00b7faf5da3f25c92a891028508
|
4
|
+
data.tar.gz: b363fa06b547c1a5612889739465af7ea46cbeea6b6922aab0d484fa21169ce6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2190d0269954730626eca1d142c8ffd00614bf98fb19d7ba4d595668e944d769636f26c151211a664ba775b2d9f1c3b5594bbd7d5cde05eec7af31e11f82dfff
|
7
|
+
data.tar.gz: 5f0ea49baf5d2fbf6a707f3b580e95364686c2a3e1439d8c3469c090335790ebeef062176e3d67e3afd7592b23e2256f92f6421a14e4b208c94b4e5a1e737a7f
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
|
1
|
+
Husc
|
2
2
|
=======
|
3
3
|
|
4
|
-
|
4
|
+
A simple crawling utility for Ruby.
|
5
5
|
|
6
6
|
|
7
7
|
## Description
|
@@ -16,29 +16,74 @@ This project enables site crawling and data extraction with xpath and css select
|
|
16
16
|
## Usage
|
17
17
|
### Simple Example
|
18
18
|
```ruby
|
19
|
-
require '
|
19
|
+
require 'husc'
|
20
20
|
|
21
21
|
url = 'http://www.example.com/'
|
22
|
-
doc =
|
22
|
+
doc = Husc(url)
|
23
23
|
|
24
|
-
#
|
24
|
+
# access another url
|
25
|
+
doc.get('another url')
|
26
|
+
|
27
|
+
# get current url
|
28
|
+
doc.url
|
29
|
+
|
30
|
+
# get current site's html
|
31
|
+
doc.html
|
32
|
+
|
33
|
+
# get <table> tags as a hash
|
34
|
+
doc.tables
|
35
|
+
# ex) doc.tables['予約・お問い合わせ'] => 050-5596-6465
|
36
|
+
```
|
37
|
+
|
38
|
+
### Scraping Example
|
39
|
+
```ruby
|
40
|
+
# search for nodes by css selector
|
41
|
+
# tag : css('name')
|
42
|
+
# class : css('.name')
|
43
|
+
# id : css('#name')
|
25
44
|
doc.css('div')
|
26
45
|
doc.css('.main-text')
|
27
46
|
doc.css('#tadjs')
|
28
47
|
|
29
|
-
#
|
48
|
+
# search for nodes by xpath
|
30
49
|
doc.xpath('//*[@id="top"]/div[1]')
|
31
50
|
|
32
|
-
#
|
33
|
-
doc.css('div').css('a')[2].attr('href')
|
34
|
-
doc.css('p').innerText()
|
35
|
-
doc.tables # -> Table Tag to Dict
|
36
|
-
|
51
|
+
# other example
|
52
|
+
doc.css('div').css('a')[2].attr('href') # => string object
|
53
|
+
doc.css('p').innerText() # => string object
|
37
54
|
# You can omit the index ("[0]") when calling a method on the first element
|
38
55
|
```
|
39
56
|
|
57
|
+
### Submitting Form Example
|
58
|
+
1. Specify target node's attribute
|
59
|
+
2. Specify value(int or str) / check(bool) / file_name(str)
|
60
|
+
3. Call submit() with the form's attribute specified
|
61
|
+
```ruby
|
62
|
+
# login
|
63
|
+
doc.send(id:'id attribute', value:'value to send')
|
64
|
+
doc.send(id:'id attribute', value:'value to send')
|
65
|
+
doc.submit(id:'id attribute') # submit
|
66
|
+
|
67
|
+
# post file
|
68
|
+
doc.send(id:'id attribute', file_name:'target file name')
|
69
|
+
|
70
|
+
# checkbox
|
71
|
+
doc.send(id:'id attribute', check:true) # check
|
72
|
+
doc.send(id:'id attribute', check:false) # uncheck
|
73
|
+
|
74
|
+
# examples of specifying other attributes
|
75
|
+
doc.send(name:'name attribute', value:'hello')
|
76
|
+
doc.send(class:'class attribute', value:100)
|
77
|
+
```
|
78
|
+
|
79
|
+
|
80
|
+
|
40
81
|
|
41
82
|
## Installation
|
42
83
|
```sh
|
43
84
|
$ gem install husc
|
44
|
-
```
|
85
|
+
```
|
86
|
+
|
87
|
+
|
88
|
+
## Contributing
|
89
|
+
Bug reports and pull requests are welcome on GitHub at [https://github.com/AjxLab/husc](https://github.com/AjxLab/husc).
|
data/husc.gemspec
CHANGED
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
|
|
13
13
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
|
14
14
|
|
15
15
|
spec.metadata["homepage_uri"] = spec.homepage
|
16
|
-
spec.metadata["source_code_uri"] = "https://github.com/AjxLab/
|
16
|
+
spec.metadata["source_code_uri"] = "https://github.com/AjxLab/husc"
|
17
17
|
|
18
18
|
# Specify which files should be added to the gem when it is released.
|
19
19
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
data/lib/husc/version.rb
CHANGED
data/lib/husc.rb
CHANGED
@@ -3,11 +3,12 @@ require 'mechanize'
|
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'net/http'
|
5
5
|
require 'kconv'
|
6
|
-
require
|
6
|
+
require 'husc/version'
|
7
7
|
|
8
8
|
module Husc
|
9
9
|
class Error < StandardError; end
|
10
|
-
|
10
|
+
|
11
|
+
class Crawler
|
11
12
|
attr_reader :url, :html, :tables, :params
|
12
13
|
|
13
14
|
# 特殊配列
|
@@ -28,7 +29,7 @@ module Husc
|
|
28
29
|
|
29
30
|
def method_missing(method, *args)
|
30
31
|
if self == []
|
31
|
-
return eval("
|
32
|
+
return eval("Crawler.new(doc: nil).#{method}(*#{args})")
|
32
33
|
end
|
33
34
|
|
34
35
|
return eval("self[0].#{method}(*#{args})")
|
@@ -118,7 +119,7 @@ module Husc
|
|
118
119
|
|
119
120
|
def xpath(locator, single = false)
|
120
121
|
## -----*----- HTMLからXPath指定で要素取得 -----*----- ##
|
121
|
-
elements = CrawlArray.new(@doc.xpath(locator).map {|el|
|
122
|
+
elements = CrawlArray.new(@doc.xpath(locator).map {|el| Crawler.new(doc: el)})
|
122
123
|
if single
|
123
124
|
# シングルノード
|
124
125
|
if elements[0] == nil
|
@@ -134,7 +135,7 @@ module Husc
|
|
134
135
|
|
135
136
|
def css(locator, single = false)
|
136
137
|
## -----*----- HTMLからCSSセレクタで要素取得 -----*----- ##
|
137
|
-
elements = CrawlArray.new(@doc.css(locator).map {|el|
|
138
|
+
elements = CrawlArray.new(@doc.css(locator).map {|el| Crawler.new(doc: el)})
|
138
139
|
if single
|
139
140
|
# シングルノード
|
140
141
|
if elements[0] == nil
|
@@ -211,3 +212,4 @@ module Husc
|
|
211
212
|
end
|
212
213
|
end
|
213
214
|
end
|
215
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: husc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tatsuya Abe
|
@@ -1077,7 +1077,7 @@ licenses:
|
|
1077
1077
|
- MIT
|
1078
1078
|
metadata:
|
1079
1079
|
homepage_uri: https://github.com/AjxLab/husc
|
1080
|
-
source_code_uri: https://github.com/AjxLab/
|
1080
|
+
source_code_uri: https://github.com/AjxLab/husc
|
1081
1081
|
post_install_message:
|
1082
1082
|
rdoc_options: []
|
1083
1083
|
require_paths:
|