genderstat 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/Gemfile +3 -0
- data/README.md +133 -0
- data/bin/genderstat +11 -0
- data/feminine_words.yaml +11 -0
- data/genderstat.gemspec +17 -0
- data/lib/genderstat.rb +48 -0
- data/lib/genderstat/file_reader.rb +9 -0
- data/lib/genderstat/text_reader.rb +16 -0
- data/lib/genderstat/version.rb +3 -0
- data/lib/genderstat/web_reader.rb +35 -0
- data/lib/genderstat/word_counter.rb +15 -0
- data/lib/genderstat/word_counters.rb +53 -0
- data/masculine_words.yaml +11 -0
- data/neutral_words.yaml +9 -0
- metadata +61 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9d948276334bd55efb359df9eb7b3fb86dbfc3c1
|
4
|
+
data.tar.gz: a462a1c66d02849eca80ade99bfc5eeec7fa7d6b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 123eca8ae0ded0774bcdfbbd4fb1754f6db043449e910ffebda9f1bcd835f3cad5d9df97ebfb2a93b28b0cde98298a8a833f675b260530f24ba4401f59073ec1
|
7
|
+
data.tar.gz: 90d1988ad9bbb61896d08bf9fe31079e16064dfe5bdcad29c1f656904056dfc6abfde7715e81b01349892b2c435e185e30f5faff83e86f72b24e5f6fa67d16ee
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
genderstat
|
2
|
+
===
|
3
|
+
genderstat calculates the amount (in terms of percentages and relative ratios) of gendered language in a file or on a site.
|
4
|
+
|
5
|
+
Specifically, it counts the number of gendered:
|
6
|
+
- subject pronouns (she/he)
|
7
|
+
- object pronouns (her/him)
|
8
|
+
- possessives (hers/his)
|
9
|
+
- generic nouns (woman/man, girl/boy, womankind/mankind, etc)
|
10
|
+
|
11
|
+
It reports these both as absolute counts and as percentages of the total word count.
|
12
|
+
|
13
|
+
Additionally, it reports the ratios between each pair of categories (e.g. 3.2 times as many
|
14
|
+
masculine words as feminine words).
|
15
|
+
|
16
|
+
Word Lists
|
17
|
+
---
|
18
|
+
genderstat comes with three wordlists:
|
19
|
+
- ```masculine_words.yaml```
|
20
|
+
- ```feminine_words.yaml```
|
21
|
+
- ```neutral_words.yaml```
|
22
|
+
|
23
|
+
They're completely editable, and genderstat will calculate for any files in the
|
24
|
+
directory whose names end with ```_words.yaml```
|
25
|
+
|
26
|
+
|
27
|
+
Installation
|
28
|
+
-------
|
29
|
+
|
30
|
+
`gem install genderstat`
|
31
|
+
|
32
|
+
Usage
|
33
|
+
---
|
34
|
+
|
35
|
+
`genderstat [FILE]`
|
36
|
+
|
37
|
+
or
|
38
|
+
|
39
|
+
`genderstat [URL]`
|
40
|
+
|
41
|
+
|
42
|
+
### File example ###
|
43
|
+
|
44
|
+
The `ralph-waldo-emerson.txt` file is a collection of his essays and `kate-chopin.txt` is a collection of her stories (including The Awakening).
|
45
|
+
```
|
46
|
+
% ruby -Ilib bin/genderstat ralph-waldo-emerson.txt
|
47
|
+
total words: 77181
|
48
|
+
feminine words: 85
|
49
|
+
masculine words: 1942
|
50
|
+
neutral words: 1714
|
51
|
+
|
52
|
+
feminine words: 0.11%
|
53
|
+
masculine words: 2.52%
|
54
|
+
neutral words: 2.22%
|
55
|
+
|
56
|
+
The ratio of feminine to masculine words is 0.04
|
57
|
+
The ratio of feminine to neutral words is 0.05
|
58
|
+
The ratio of masculine to feminine words is 22.85
|
59
|
+
The ratio of masculine to neutral words is 1.13
|
60
|
+
The ratio of neutral to feminine words is 20.16
|
61
|
+
The ratio of neutral to masculine words is 0.88
|
62
|
+
|
63
|
+
|
64
|
+
% ruby -Ilib bin/genderstat kate-chopin.txt
|
65
|
+
total words: 67023
|
66
|
+
feminine words: 3147
|
67
|
+
masculine words: 1724
|
68
|
+
neutral words: 1095
|
69
|
+
|
70
|
+
feminine words: 4.7%
|
71
|
+
masculine words: 2.57%
|
72
|
+
neutral words: 1.63%
|
73
|
+
|
74
|
+
The ratio of feminine to masculine words is 1.83
|
75
|
+
The ratio of feminine to neutral words is 2.87
|
76
|
+
The ratio of masculine to feminine words is 0.55
|
77
|
+
The ratio of masculine to neutral words is 1.57
|
78
|
+
The ratio of neutral to feminine words is 0.35
|
79
|
+
The ratio of neutral to masculine words is 0.64
|
80
|
+
```
|
81
|
+
|
82
|
+
### URL example ###
|
83
|
+
```
|
84
|
+
% ruby -Ilib bin/genderstat feministing.com
|
85
|
+
total words: 4908
|
86
|
+
feminine words: 20
|
87
|
+
masculine words: 3
|
88
|
+
neutral words: 17
|
89
|
+
|
90
|
+
feminine words: 0.41%
|
91
|
+
masculine words: 0.06%
|
92
|
+
neutral words: 0.35%
|
93
|
+
|
94
|
+
The ratio of feminine to masculine words is 6.67
|
95
|
+
The ratio of feminine to neutral words is 1.18
|
96
|
+
The ratio of masculine to feminine words is 0.15
|
97
|
+
The ratio of masculine to neutral words is 0.18
|
98
|
+
The ratio of neutral to feminine words is 0.85
|
99
|
+
The ratio of neutral to masculine words is 5.67
|
100
|
+
|
101
|
+
% ruby -Ilib bin/genderstat stallman.org
|
102
|
+
total words: 5673
|
103
|
+
feminine words: 1
|
104
|
+
masculine words: 16
|
105
|
+
neutral words: 55
|
106
|
+
|
107
|
+
feminine words: 0.02%
|
108
|
+
masculine words: 0.28%
|
109
|
+
neutral words: 0.97%
|
110
|
+
|
111
|
+
The ratio of feminine to masculine words is 0.06
|
112
|
+
The ratio of feminine to neutral words is 0.02
|
113
|
+
The ratio of masculine to feminine words is 16.0
|
114
|
+
The ratio of masculine to neutral words is 0.29
|
115
|
+
The ratio of neutral to feminine words is 55.0
|
116
|
+
The ratio of neutral to masculine words is 3.44
|
117
|
+
```
|
118
|
+
todo
|
119
|
+
---
|
120
|
+
- Add tests
|
121
|
+
- Add support for more stats (like statistical significance?)
|
122
|
+
- Add support for reading from stdin
|
123
|
+
- Handle scenario of https->redirect (rather than weird nil error)
|
124
|
+
|
125
|
+
Dependencies
|
126
|
+
------------
|
127
|
+
- Ruby 2.0+
|
128
|
+
- ```open_uri_redirections``` gem
|
129
|
+
|
130
|
+
You can get the gem by running ```bundle install```.
|
131
|
+
|
132
|
+
It's not strictly necessary; it just allows for HTTP->HTTPS redirections.
|
133
|
+
|
data/bin/genderstat
ADDED
data/feminine_words.yaml
ADDED
data/genderstat.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
$:.push File.expand_path("../lib", __FILE__)
|
2
|
+
require "genderstat/version"
|
3
|
+
|
4
|
+
# Gem specification for genderstat: package metadata, the files to ship,
# the command-line executable and the runtime requirements.
Gem::Specification.new do |s|
  s.name = 'genderstat'
  s.version = Genderstat::VERSION
  s.date = '2014-08-23'
  s.summary = "A gendered language frequency calculator"
  s.description = "Calculate the relative frequencies of gendered language in a file or on the web"
  s.authors = ["Sean Collins"]
  s.email = 'sean@cllns.com'
  # Ship exactly the files tracked by git ($/ is the input record separator).
  s.files = `git ls-files`.split($/)
  s.executables = 'genderstat'
  s.required_ruby_version = '>= 2.0'
  # lib/genderstat/web_reader.rb requires this gem unconditionally at load
  # time, so it has to be installed together with genderstat.
  s.add_runtime_dependency 'open_uri_redirections'
  # GitHub repositories are served from github.com (not github.org).
  s.homepage = 'https://github.com/cllns/genderstat'
  s.license = 'MIT'
end
|
data/lib/genderstat.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
# A program to determine how "masculine" a piece of writing is.
|
3
|
+
# Sean Collins 11/11/12
|
4
|
+
require 'yaml'
|
5
|
+
require 'genderstat/text_reader'
|
6
|
+
require 'genderstat/word_counters'
|
7
|
+
|
8
|
+
# Measures how much gendered language a text contains and prints a report
# of absolute counts, percentages and pairwise ratios.
#
# Typical use:
#   stats = Genderstat.new(path_or_url)
#   stats.calculate
#   stats.print_all_results
class Genderstat
  # Number of decimal places kept when percentages and ratios are rounded.
  DECIMAL_DIGITS_OF_PRECISION = 2

  # arg - locator of the text to analyse; it is handed to TextReader as is.
  def initialize arg
    @word_counters = WordCounters.new
    @text_reader = TextReader.new arg
  end

  # Reads the text and feeds every word to the word counters.
  #
  # The work is done only once: later calls return the cached word list, so
  # calling this method again can never double the counts.
  #
  # Returns the Array of words that was read.
  def calculate
    return @all_words if @all_words

    @all_words = @text_reader.read
    @all_words.each { |word| @word_counters.check(word) }
  end

  # Prints the complete report: total word count, per-list totals,
  # percentages and ratios. Runs #calculate first if it has not run yet.
  def print_all_results
    calculate
    puts "total words: #{@all_words.count}"
    print_totals
    print_percentages
    print_ratios
  end

  # Prints one "<name> words: <count>" line per word list, then a blank line.
  def print_totals
    calculate
    @word_counters.get_totals.each do |name, count|
      puts "#{name} words: #{count}"
    end
    puts
  end

  # Prints one "<name> words: <percentage>%" line per word list, then a
  # blank line. Percentages are relative to the total number of words read.
  def print_percentages
    calculate
    @word_counters.get_percentages(@all_words.count).each do |name, percentage|
      puts "#{name} words: #{percentage}%"
    end
    puts
  end

  # Prints one sentence per ordered pair of word lists, then a blank line.
  # Ratio keys look like "feminine_to_masculine"; underscores become spaces.
  def print_ratios
    calculate
    @word_counters.get_ratios.each do |name, ratio|
      puts "The ratio of #{name.tr("_", " ")} words is #{ratio}"
    end
    puts
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'genderstat/file_reader'
|
2
|
+
require 'genderstat/web_reader'
|
3
|
+
|
4
|
+
# Reads the text to analyse from either a local file or a web address and
# returns it as a list of lower-cased words.
class TextReader
|
5
|
+
# file_locator - a path or a URL. If a file exists at that path a FileReader
# is used; anything else is handed to WebReader.
def initialize file_locator
|
6
|
+
if File.exist? file_locator
|
7
|
+
@reader = FileReader.new file_locator
|
8
|
+
else
|
9
|
+
@reader = WebReader.new file_locator
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
# Returns the whole text downcased and split on whitespace (Array of String).
# NOTE(review): split only separates on whitespace, so punctuation stays
# attached to a token (e.g. "her."); confirm the word lists account for this.
def read
|
14
|
+
@reader.read.downcase.split
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'open_uri_redirections'
|
3
|
+
require 'socket'
|
4
|
+
|
5
|
+
# Fetches the body of a web page so it can be analysed like a local file.
class WebReader
  # Matches a leading "scheme://" such as "http://" or "https://".
  SCHEME_PREFIX = %r{\A[a-z][a-z0-9+.\-]*://}i

  # url - address of the page; "http://" is prepended when no scheme is given.
  def initialize url
    @url = clean_up_url url
  end

  # Downloads the page and returns its body as a String.
  #
  # Any StandardError raised while fetching ends the program through abort
  # with a short message. Signals such as Interrupt are deliberately not
  # rescued, so Ctrl-C is not turned into an error message.
  #
  # NOTE(review): Kernel#open stopped accepting URLs in Ruby 3.0 (URI.open is
  # the replacement from Ruby 2.5 on); confirm the supported Ruby versions
  # before switching the call below.
  def read
    begin
      open(@url, :allow_redirections => :safe).read
    rescue StandardError => ex
      handle_web_exceptions ex
    end
  end

  private

  # Aborts the program with a message suited to the failure.
  #
  # ex - the exception raised while fetching. HTTP and socket errors get a
  #      friendly "Could not open" message naming the URL; anything else is
  #      reported with the exception's own message.
  def handle_web_exceptions ex
    case ex
    when OpenURI::HTTPError, SocketError
      abort "Could not open: #{@url}"
    else
      abort ex.to_s
    end
  end

  # If the URL doesn't start with a scheme ("http://", "https://", ...),
  # prepend it with http://. Only the start of the string is inspected, so a
  # URL embedded in a query string or a bare "host:port" is not mistaken for
  # a scheme.
  def clean_up_url url
    if url =~ SCHEME_PREFIX
      url
    else
      "http://#{url}"
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
# Counts how many checked words belong to one word list.
#
# The list is a YAML file holding an array of words. The counter's name is
# the part of the file name before the first underscore, e.g.
# "feminine_words.yaml" gives "feminine".
class WordCounter
  # count - number of checked words found in the list so far.
  # name  - label derived from the word list's file name.
  attr_reader :count, :name

  # filename - path to a "<name>_words.yaml" file.
  def initialize filename
    # An empty YAML file loads as a falsy value; treat it as an empty list.
    # Entries are downcased because the analysed text is downcased before it
    # is checked, so a capitalised entry could otherwise never match.
    entries = YAML.load_file(filename) || []
    @words = Set.new(entries.map { |entry| entry.to_s.downcase })
    # Use only the file's base name so underscores in parent directories
    # (e.g. "/home/some_user/...") cannot corrupt the name.
    @name = File.basename(filename).split('_').first
    @count = 0
  end

  # Increments the count when word_in_question is in the list.
  #
  # Returns the new count on a match and nil otherwise.
  def is_in_here? word_in_question
    @count += 1 if @words.include? word_in_question
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'genderstat/word_counter'
|
2
|
+
|
3
|
+
# Holds one WordCounter per bundled word list and derives totals,
# percentages and pairwise ratios from their counts.
class WordCounters
  # Builds a WordCounter for every "*_words.yaml" file shipped with the gem.
  def initialize
    # The word lists are two directories higher than this file, so that's
    # how we have to reference their locations
    word_list_location = File.join(File.dirname(__FILE__), "../../*_words.yaml")

    # Sorted so the report order does not depend on the order in which the
    # filesystem happens to list the files.
    @word_counters = Dir.glob(word_list_location).sort.map do |filename|
      WordCounter.new(filename)
    end
  end

  # Offers word to every counter; each one counts it if it is in its list.
  def check word
    @word_counters.each { |wc| wc.is_in_here? word }
  end

  # Returns a Hash mapping each list name to its absolute count.
  def get_totals
    @word_counters.each_with_object({}) do |wc, hash|
      hash[wc.name] = wc.count
    end
  end

  # Returns a Hash mapping each list name to its share of total_word_count,
  # as a rounded percentage. An empty text (zero words) gives 0.0 for every
  # list instead of NaN.
  def get_percentages total_word_count
    @word_counters.each_with_object({}) do |wc, hash|
      hash[wc.name] =
        if total_word_count.zero?
          0.0
        else
          round(100 * (wc.count.to_f / total_word_count))
        end
    end
  end

  # We get the ratios of counts among all word lists.
  # This returns a hash with keys that are named "name_to_other_name".
  # Both directions are computed even though they are reciprocals of one
  # another; it's only division.
  #
  # This is plain Float division, so a zero count in the denominator gives
  # Infinity (or NaN for 0/0) rather than raising.
  def get_ratios
    @word_counters.each_with_object({}) do |wc, hash|
      @word_counters.each do |other_wc|
        # We skip over the ratio of name_to_name, since it'll always be one
        next if other_wc.equal?(wc)

        hash["#{wc.name}_to_#{other_wc.name}"] = round(wc.count.to_f / other_wc.count.to_f)
      end
    end
  end

  # Rounds float to the number of decimal places configured on Genderstat.
  def round float
    float.round(Genderstat::DECIMAL_DIGITS_OF_PRECISION)
  end
end
|
data/neutral_words.yaml
ADDED
metadata
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: genderstat
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Sean Collins
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-08-23 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Calculate the relative frequencies of gendered language in a file or
|
14
|
+
on the web
|
15
|
+
email: sean@cllns.com
|
16
|
+
executables:
|
17
|
+
- genderstat
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- ".gitignore"
|
22
|
+
- Gemfile
|
23
|
+
- README.md
|
24
|
+
- bin/genderstat
|
25
|
+
- feminine_words.yaml
|
26
|
+
- genderstat.gemspec
|
27
|
+
- lib/genderstat.rb
|
28
|
+
- lib/genderstat/file_reader.rb
|
29
|
+
- lib/genderstat/text_reader.rb
|
30
|
+
- lib/genderstat/version.rb
|
31
|
+
- lib/genderstat/web_reader.rb
|
32
|
+
- lib/genderstat/word_counter.rb
|
33
|
+
- lib/genderstat/word_counters.rb
|
34
|
+
- license.txt
|
35
|
+
- masculine_words.yaml
|
36
|
+
- neutral_words.yaml
|
37
|
+
homepage: http://github.org/cllns/genderstat
|
38
|
+
licenses:
|
39
|
+
- MIT
|
40
|
+
metadata: {}
|
41
|
+
post_install_message:
|
42
|
+
rdoc_options: []
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '2.0'
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
requirements: []
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 2.2.0
|
58
|
+
signing_key:
|
59
|
+
specification_version: 4
|
60
|
+
summary: A gendered language frequency calculator
|
61
|
+
test_files: []
|